diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -7,6 +7,7 @@
import json
import logging
import os
+import string
from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union
from urllib.parse import quote
@@ -16,11 +17,9 @@
from swh.loader.package.loader import (
BasePackageInfo,
PackageLoader,
- PartialExtID,
RawExtrinsicMetadataCore,
)
from swh.loader.package.utils import api_info, cached_method, release_name
-from swh.model.hashutil import hash_to_bytes
from swh.model.model import (
MetadataAuthority,
MetadataAuthorityType,
@@ -38,18 +37,25 @@
EMPTY_PERSON = Person.from_fullname(b"")
-EXTID_TYPE = "npm-archive-sha1"
-EXTID_VERSION = 0
-
-
@attr.s
class NpmPackageInfo(BasePackageInfo):
raw_info = attr.ib(type=Dict[str, Any])
+ id_ = attr.ib(type=str)
+ """Unique id assigned by the registry for this version."""
+
date = attr.ib(type=Optional[str])
shasum = attr.ib(type=str)
"""sha1 checksum"""
+ # We cannot rely on $shasum alone: two versions of the same package may ship the
+ # exact same tarball, while their release data (message and date) is extrinsic
+ # to the tarball and differs between versions. The manifest therefore also
+ # includes $id_, so that it differs for each version.
+ MANIFEST_FORMAT = string.Template("$id_ $shasum")
+ EXTID_TYPE = "npm-archive-url-and-sha1"
+ EXTID_VERSION = 0
+
@classmethod
def from_metadata(
cls, project_metadata: Dict[str, Any], version: str
@@ -70,6 +76,7 @@
date = None
return cls(
+ id_=package_metadata["_id"],
url=url,
filename=os.path.basename(url),
date=date,
@@ -84,9 +91,6 @@
],
)
- def extid(self) -> PartialExtID:
- return (EXTID_TYPE, EXTID_VERSION, hash_to_bytes(self.shasum))
-
class NpmLoader(PackageLoader[NpmPackageInfo]):
"""Load npm origin's artifact releases into swh archive.
diff --git a/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz b/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz
new file mode 120000
--- /dev/null
+++ b/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz
@@ -0,0 +1 @@
+org_-_org-0.0.3.tgz
\ No newline at end of file
diff --git a/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch b/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch
@@ -0,0 +1,141 @@
+{
+ "_id": "org",
+ "_rev": "4-22484cc537f12d3023241211ee34e39d",
+ "name": "org",
+ "description": "A parser and converter for org-mode notation",
+ "dist-tags": {
+ "latest": "0.0.3"
+ },
+ "versions": {
+ "0.0.3-beta": {
+ "name": "org",
+ "description": "A parser and converter for org-mode notation",
+ "homepage": "http://mooz.github.com/org-js",
+ "keywords": [
+ "org-mode",
+ "emacs",
+ "parser"
+ ],
+ "author": {
+ "name": "mooz",
+ "email": "stillpedant@gmail.com"
+ },
+ "main": "./lib/org.js",
+ "version": "0.0.3-beta",
+ "directories": {
+ "test": "./tests"
+ },
+ "repository": {
+ "type": "git",
+ "url": "git://github.com/mooz/org-js.git"
+ },
+ "bugs": {
+ "url": "https://github.com/mooz/org-js/issues"
+ },
+ "_id": "org@0.0.3-beta",
+ "dist": {
+ "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0",
+ "tarball": "https://registry.npmjs.org/org/-/org-0.0.3-beta.tgz"
+ },
+ "_from": ".",
+ "_npmVersion": "1.2.25",
+ "_npmUser": {
+ "name": "mooz",
+ "email": "stillpedant@gmail.com"
+ },
+ "maintainers": [
+ {
+ "name": "mooz",
+ "email": "stillpedant@gmail.com"
+ }
+ ]
+ },
+ "0.0.3": {
+ "name": "org",
+ "description": "A parser and converter for org-mode notation",
+ "homepage": "http://mooz.github.com/org-js",
+ "bugs": {
+ "url": "http://github.com/mooz/org-s/issues"
+ },
+ "keywords": [
+ "org-mode",
+ "emacs",
+ "parser"
+ ],
+ "author": {
+ "name": "Masafumi Oyamada",
+ "email": "stillpedant@gmail.com",
+ "url": "http://mooz.github.io/"
+ },
+ "licenses": [
+ {
+ "type": "MIT"
+ }
+ ],
+ "main": "./lib/org.js",
+ "version": "0.0.3",
+ "directories": {
+ "test": "./tests"
+ },
+ "repository": {
+ "type": "git",
+ "url": "git://github.com/mooz/org-js.git"
+ },
+ "_id": "org@0.0.3",
+ "dist": {
+ "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0",
+ "tarball": "https://registry.npmjs.org/org/-/org-0.0.3.tgz"
+ },
+ "_from": ".",
+ "_npmVersion": "1.2.25",
+ "_npmUser": {
+ "name": "mooz",
+ "email": "stillpedant@gmail.com"
+ },
+ "maintainers": [
+ {
+ "name": "mooz",
+ "email": "stillpedant@gmail.com"
+ }
+ ]
+ }
+ },
+ "readme": "org-js\n======\n\nParser and converter for org-mode () notation written in JavaScript.\n\nInteractive Editor\n------------------\n\nFor working example, see http://mooz.github.com/org-js/editor/.\n\nInstallation\n------------\n\n npm install org\n\nSimple example of org -> HTML conversion\n----------------------------------------\n\n```javascript\nvar org = require(\"org\");\n\nvar parser = new org.Parser();\nvar orgDocument = parser.parse(orgCode);\nvar orgHTMLDocument = orgDocument.convert(org.ConverterHTML, {\n headerOffset: 1,\n exportFromLineNumber: false,\n suppressSubScriptHandling: false,\n suppressAutoLink: false\n});\n\nconsole.dir(orgHTMLDocument); // => { title, contentHTML, tocHTML, toc }\nconsole.log(orgHTMLDocument.toString()) // => Rendered HTML\n```\n\nWriting yet another converter\n-----------------------------\n\nSee `lib/org/converter/html.js`.\n",
+ "maintainers": [
+ {
+ "name": "mooz",
+ "email": "stillpedant@gmail.com"
+ }
+ ],
+ "time": {
+ "modified": "2019-01-05T01:37:44Z",
+ "created": "2014-01-01T15:40:31Z",
+ "0.0.3-beta": "2014-01-01T15:40:33Z",
+ "0.0.3": "2014-01-01T15:55:45Z"
+ },
+ "author": {
+ "name": "Masafumi Oyamada",
+ "email": "stillpedant@gmail.com",
+ "url": "http://mooz.github.io/"
+ },
+ "repository": {
+ "type": "git",
+ "url": "git://github.com/mooz/org-js.git"
+ },
+ "users": {
+ "nak2k": true,
+ "bgschaid": true,
+ "422665vijay": true,
+ "nontau": true
+ },
+ "homepage": "http://mooz.github.com/org-js",
+ "keywords": [
+ "org-mode",
+ "emacs",
+ "parser"
+ ],
+ "bugs": {
+ "url": "http://github.com/mooz/org-s/issues"
+ },
+ "readmeFilename": "README.md"
+}
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -528,6 +528,94 @@
} == stats
+def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir):
+ """Test with two versions that have exactly the same tarball"""
+ package = "org_version_mismatch"
+ url = package_url(package)
+ loader = NpmLoader(swh_storage, url)
+
+ actual_load_status = loader.load()
+ expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a")
+ assert actual_load_status == {
+ "status": "eventful",
+ "snapshot_id": expected_snapshot_id.hex(),
+ }
+
+ assert_last_visit_matches(
+ swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
+ )
+
+ beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1"
+ release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0"
+ versions = [
+ ("0.0.3-beta", beta_release_id),
+ ("0.0.3", release_id),
+ ]
+
+ expected_snapshot = Snapshot(
+ id=expected_snapshot_id,
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=b"releases/0.0.3", target_type=TargetType.ALIAS
+ ),
+ **{
+ b"releases/"
+ + version_name.encode(): SnapshotBranch(
+ target=hash_to_bytes(version_id), target_type=TargetType.RELEASE,
+ )
+ for (version_name, version_id) in versions
+ },
+ },
+ )
+ check_snapshot(expected_snapshot, swh_storage)
+
+ assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release(
+ name=b"0.0.3-beta",
+ message=(
+ b"Synthetic release for NPM source package org_version_mismatch "
+ b"version 0.0.3-beta\n"
+ ),
+ target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
+ target_type=ModelObjectType.DIRECTORY,
+ synthetic=True,
+ author=Person.from_fullname(b"Masafumi Oyamada "),
+ date=TimestampWithTimezone.from_datetime(
+ datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
+ ),
+ id=hash_to_bytes(beta_release_id),
+ )
+
+ assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
+ name=b"0.0.3",
+ message=(
+ b"Synthetic release for NPM source package org_version_mismatch "
+ b"version 0.0.3\n"
+ ),
+ target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
+ target_type=ModelObjectType.DIRECTORY,
+ synthetic=True,
+ author=Person.from_fullname(b"Masafumi Oyamada "),
+ date=TimestampWithTimezone.from_datetime(
+ datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc)
+ ),
+ id=hash_to_bytes(release_id),
+ )
+
+ # Check that an incremental re-load is uneventful and keeps the same snapshot
+
+ loader = NpmLoader(swh_storage, url)
+
+ actual_load_status = loader.load()
+ assert actual_load_status == {
+ "status": "uneventful",
+ "snapshot_id": expected_snapshot_id.hex(),
+ }
+
+ assert_last_visit_matches(
+ swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
+ )
+
+
def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir):
"""Skip artifact with no intrinsic metadata during ingestion