F9123652
diff --git a/PKG-INFO b/PKG-INFO
index 11b4266..67e16d6 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,32 +1,32 @@
Metadata-Version: 2.1
Name: swh.loader.core
-Version: 0.6.0
+Version: 0.6.1
Summary: Software Heritage Base Loader
Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/
Description: SWH-loader-core
===============
The Software Heritage Core Loader provides low-level loading utilities and
helpers used by other loaders.
The main entry points are classes:
- :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn)
- :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...)
- :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...)
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
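As a quick orientation for the entry points listed in the description above, a minimal import sketch (not part of this diff):

# A new loader subclasses one of the base classes named in the description.
from swh.loader.core.loader import BaseLoader, DVCSLoader  # low-level / DVCS loaders
from swh.loader.package.loader import PackageLoader        # package loaders (PyPI, npm, ...)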
diff --git a/swh.loader.core.egg-info/PKG-INFO b/swh.loader.core.egg-info/PKG-INFO
index 11b4266..67e16d6 100644
--- a/swh.loader.core.egg-info/PKG-INFO
+++ b/swh.loader.core.egg-info/PKG-INFO
@@ -1,32 +1,32 @@
Metadata-Version: 2.1
Name: swh.loader.core
-Version: 0.6.0
+Version: 0.6.1
Summary: Software Heritage Base Loader
Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/
Description: SWH-loader-core
===============
The Software Heritage Core Loader provides low-level loading utilities and
helpers used by other loaders.
The main entry points are classes:
- :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn)
- :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...)
- :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...)
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
index afa682c..8c4f8c4 100644
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -1,430 +1,430 @@
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import hashlib
import logging
import os
from abc import ABCMeta, abstractmethod
from typing import Any, Dict, Iterable, Optional, Tuple
from swh.core import config
from swh.model.model import (
BaseContent,
Content,
SkippedContent,
Directory,
Origin,
OriginVisit,
OriginVisitStatus,
Revision,
Release,
Sha1Git,
Snapshot,
)
from swh.storage import get_storage
from swh.storage.utils import now
class BaseLoader(config.SWHConfig, metaclass=ABCMeta):
"""Mixin base class for loader.
To use this class, you must:
- inherit from this class
- and implement the @abstractmethod methods:
- :func:`prepare`: First step executed by the loader to prepare some
state needed by the :func:`load` method.
- :func:`get_origin`: Retrieve the origin that is currently being loaded.
- :func:`fetch_data`: The method to implement to fetch the data to
inject into swh (through the store_data method)
- :func:`store_data`: Store the fetched data.
- :func:`visit_status`: Explicit status of the visit ('partial' or
'full')
- :func:`load_status`: Explicit status of the loading, for use by the
scheduler (eventful/uneventful/temporary failure/permanent failure).
- :func:`cleanup`: Last step executed by the loader.
The entry point for the resulting loader is :func:`load`.
You can take a look at some example classes:
- :class:`BaseSvnLoader`
"""
CONFIG_BASE_FILENAME = None # type: Optional[str]
DEFAULT_CONFIG = {
"storage": ("dict", {"cls": "remote", "url": "http://localhost:5002/",}),
"max_content_size": ("int", 100 * 1024 * 1024),
"save_data": ("bool", False),
"save_data_path": ("str", ""),
} # type: Dict[str, Tuple[str, Any]]
ADDITIONAL_CONFIG = {} # type: Dict[str, Tuple[str, Any]]
def __init__(
self, logging_class: Optional[str] = None, config: Dict[str, Any] = {}
):
if config:
self.config = config
else:
self.config = self.parse_config_file(
additional_configs=[self.ADDITIONAL_CONFIG]
)
self.storage = get_storage(**self.config["storage"])
if logging_class is None:
logging_class = "%s.%s" % (
self.__class__.__module__,
self.__class__.__name__,
)
self.log = logging.getLogger(logging_class)
_log = logging.getLogger("requests.packages.urllib3.connectionpool")
_log.setLevel(logging.WARN)
self.max_content_size = self.config["max_content_size"]
# possibly overridden in self.prepare method
self.visit_date: Optional[datetime.datetime] = None
self.origin: Optional[Origin] = None
if not hasattr(self, "visit_type"):
self.visit_type: Optional[str] = None
self.origin_metadata: Dict[str, Any] = {}
self.loaded_snapshot_id: Optional[Sha1Git] = None
# Make sure the config is sane
save_data = self.config.get("save_data")
if save_data:
path = self.config["save_data_path"]
os.stat(path)
if not os.access(path, os.R_OK | os.W_OK):
raise PermissionError("Permission denied: %r" % path)
def save_data(self) -> None:
"""Save the data associated to the current load"""
raise NotImplementedError
def get_save_data_path(self) -> str:
"""The path to which we archive the loader's raw data"""
if not hasattr(self, "__save_data_path"):
year = str(self.visit_date.year) # type: ignore
assert self.origin
url = self.origin.url.encode("utf-8")
origin_url_hash = hashlib.sha1(url).hexdigest()
path = "%s/sha1:%s/%s/%s" % (
self.config["save_data_path"],
origin_url_hash[0:2],
origin_url_hash,
year,
)
os.makedirs(path, exist_ok=True)
self.__save_data_path = path
return self.__save_data_path
def flush(self) -> None:
"""Flush any potential buffered data not sent to swh-storage.
"""
self.storage.flush()
@abstractmethod
def cleanup(self) -> None:
"""Last step executed by the loader.
"""
pass
@abstractmethod
def prepare_origin_visit(self, *args, **kwargs) -> None:
"""First step executed by the loader to prepare origin and visit
references. Set/update self.origin, and
optionally self.origin_url, self.visit_date.
"""
pass
def _store_origin_visit(self) -> None:
"""Store origin and visit references. Sets the self.visit references.
"""
assert self.origin
self.storage.origin_add([self.origin])
if not self.visit_date: # now as default visit_date if not provided
self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
assert isinstance(self.visit_date, datetime.datetime)
assert isinstance(self.visit_type, str)
self.visit = self.storage.origin_visit_add(
[
OriginVisit(
origin=self.origin.url, date=self.visit_date, type=self.visit_type,
)
]
)[0]
@abstractmethod
def prepare(self, *args, **kwargs) -> None:
"""Second step executed by the loader to prepare some state needed by
the loader.
"""
pass
def get_origin(self) -> Origin:
"""Get the origin that is currently being loaded.
self.origin should be set in :func:`prepare_origin`
Returns:
dict: an origin ready to be sent to storage by
- :func:`origin_add_one`.
+ :func:`origin_add`.
"""
assert self.origin
return self.origin
@abstractmethod
def fetch_data(self) -> bool:
"""Fetch the data from the source the loader is currently loading
(ex: git/hg/svn/... repository).
Returns:
a value that is interpreted as a boolean. If True, fetch_data needs
to be called again to complete loading.
"""
pass
@abstractmethod
def store_data(self):
"""Store fetched data in the database.
Should call the :func:`maybe_load_xyz` methods, which handle the
bundles sent to storage, rather than send directly.
"""
pass
def store_metadata(self) -> None:
"""Store fetched metadata in the database.
For more information, see implementation in :class:`DepositLoader`.
"""
pass
def load_status(self) -> Dict[str, str]:
"""Detailed loading status.
Defaults to logging an eventful load.
Returns: a dictionary that is eventually passed back as the task's
result to the scheduler, allowing tuning of the task recurrence
mechanism.
"""
return {
"status": "eventful",
}
def post_load(self, success: bool = True) -> None:
"""Permit the loader to do some additional actions according to status
after the loading is done. The flag success indicates the
loading's status.
Defaults to doing nothing.
This is up to the implementer of this method to make sure this
does not break.
Args:
success (bool): the success status of the loading
"""
pass
def visit_status(self) -> str:
"""Detailed visit status.
Defaults to logging a full visit.
"""
return "full"
def pre_cleanup(self) -> None:
"""As a first step, will try and check for dangling data to cleanup.
This should do its best to avoid raising issues.
"""
pass
def load(self, *args, **kwargs) -> Dict[str, str]:
r"""Loading logic for the loader to follow:
- 1. Call :meth:`prepare_origin_visit` to prepare the
origin and visit we will associate loading data to
- 2. Store the actual ``origin_visit`` to storage
- 3. Call :meth:`prepare` to prepare any eventual state
- 4. Call :meth:`get_origin` to get the origin we work with and store
- while True:
- 5. Call :meth:`fetch_data` to fetch the data to store
- 6. Call :meth:`store_data` to store the data
- 7. Call :meth:`cleanup` to clean up any eventual state put in place
in :meth:`prepare` method.
"""
try:
self.pre_cleanup()
except Exception:
msg = "Cleaning up dangling data failed! Continue loading."
self.log.warning(msg)
self.prepare_origin_visit(*args, **kwargs)
self._store_origin_visit()
assert self.origin
try:
self.prepare(*args, **kwargs)
while True:
more_data_to_fetch = self.fetch_data()
self.store_data()
if not more_data_to_fetch:
break
self.store_metadata()
visit_status = OriginVisitStatus(
origin=self.origin.url,
visit=self.visit.visit,
date=now(),
status=self.visit_status(),
snapshot=self.loaded_snapshot_id,
)
self.storage.origin_visit_status_add([visit_status])
self.post_load()
except Exception:
self.log.exception(
"Loading failure, updating to `partial` status",
extra={"swh_task_args": args, "swh_task_kwargs": kwargs,},
)
visit_status = OriginVisitStatus(
origin=self.origin.url,
visit=self.visit.visit,
date=now(),
status="partial",
snapshot=self.loaded_snapshot_id,
)
self.storage.origin_visit_status_add([visit_status])
self.post_load(success=False)
return {"status": "failed"}
finally:
self.flush()
self.cleanup()
return self.load_status()
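# ---------------------------------------------------------------------------
# Illustrative sketch (not part of swh-loader-core): a minimal concrete loader
# implementing the abstract methods described in the BaseLoader docstring
# above. The class name, visit type and example URL are hypothetical.
# ---------------------------------------------------------------------------
class TrivialLoader(BaseLoader):
    visit_type = "trivial"

    def prepare_origin_visit(self, url: str) -> None:
        self.origin = Origin(url=url)

    def prepare(self, url: str) -> None:
        pass  # nothing to set up for this sketch

    def fetch_data(self) -> bool:
        return False  # False: no further fetch_data/store_data round needed

    def store_data(self) -> None:
        pass  # a real loader would send fetched objects to self.storage here

    def cleanup(self) -> None:
        pass  # nothing to tear down

# Typical use, following the sequence documented in :meth:`BaseLoader.load`:
#   loader = TrivialLoader(config={
#       "storage": {"cls": "memory"},
#       "max_content_size": 100 * 1024 * 1024,
#       "save_data": False,
#       "save_data_path": "",
#   })
#   loader.load(url="https://example.org/origin")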
class DVCSLoader(BaseLoader):
"""This base class is a pattern for dvcs loaders (e.g. git, mercurial).
Those loaders are able to load all the data in one go. For example, the
loader defined in swh-loader-git :class:`BulkUpdater`.
For other loaders (stateful one, (e.g :class:`SWHSvnLoader`),
inherit directly from :class:`BaseLoader`.
"""
ADDITIONAL_CONFIG = {} # type: Dict[str, Tuple[str, Any]]
def cleanup(self) -> None:
"""Clean up an eventual state installed for computations."""
pass
def has_contents(self) -> bool:
"""Checks whether we need to load contents"""
return True
def get_contents(self) -> Iterable[BaseContent]:
"""Get the contents that need to be loaded"""
raise NotImplementedError
def has_directories(self) -> bool:
"""Checks whether we need to load directories"""
return True
def get_directories(self) -> Iterable[Directory]:
"""Get the directories that need to be loaded"""
raise NotImplementedError
def has_revisions(self) -> bool:
"""Checks whether we need to load revisions"""
return True
def get_revisions(self) -> Iterable[Revision]:
"""Get the revisions that need to be loaded"""
raise NotImplementedError
def has_releases(self) -> bool:
"""Checks whether we need to load releases"""
return True
def get_releases(self) -> Iterable[Release]:
"""Get the releases that need to be loaded"""
raise NotImplementedError
def get_snapshot(self) -> Snapshot:
"""Get the snapshot that needs to be loaded"""
raise NotImplementedError
def eventful(self) -> bool:
"""Whether the load was eventful"""
raise NotImplementedError
def store_data(self) -> None:
assert self.origin
if self.config.get("save_data"):
self.save_data()
if self.has_contents():
contents = []
skipped_contents = []
for obj in self.get_contents():
if isinstance(obj, Content):
contents.append(obj)
elif isinstance(obj, SkippedContent):
skipped_contents.append(obj)
else:
raise TypeError(f"Unexpected content type: {obj}")
self.storage.skipped_content_add(skipped_contents)
self.storage.content_add(contents)
if self.has_directories():
self.storage.directory_add(self.get_directories())
if self.has_revisions():
self.storage.revision_add(self.get_revisions())
if self.has_releases():
self.storage.release_add(self.get_releases())
snapshot = self.get_snapshot()
self.storage.snapshot_add([snapshot])
self.flush()
self.loaded_snapshot_id = snapshot.id
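To illustrate the DVCSLoader pattern defined above (everything is loaded in one go through the get_contents/get_directories/get_revisions/get_releases/get_snapshot hooks), here is a minimal, hypothetical sketch. It is not part of this diff; the class name and the empty collections are placeholders, and the snapshot id is the empty-snapshot id also asserted in the npm tests below.

from typing import Iterable

from swh.loader.core.loader import DVCSLoader
from swh.model.hashutil import hash_to_bytes
from swh.model.model import BaseContent, Directory, Origin, Release, Revision, Snapshot


class EmptyDVCSLoader(DVCSLoader):
    """Hypothetical DVCS loader that produces an empty snapshot."""

    visit_type = "empty"

    def prepare_origin_visit(self, url: str) -> None:
        self.origin = Origin(url=url)

    def prepare(self, url: str) -> None:
        pass

    def fetch_data(self) -> bool:
        return False  # a single fetch/store round is enough

    def get_contents(self) -> Iterable[BaseContent]:
        return []

    def get_directories(self) -> Iterable[Directory]:
        return []

    def get_revisions(self) -> Iterable[Revision]:
        return []

    def get_releases(self) -> Iterable[Release]:
        return []

    def get_snapshot(self) -> Snapshot:
        # id of the empty snapshot (also used in the npm tests further down)
        return Snapshot(
            id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}
        )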
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
index b6eb5e0..5591edc 100644
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -1,287 +1,302 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
import os
from codecs import BOM_UTF8
-from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional
+from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union
import attr
import chardet
from urllib.parse import quote
from swh.model.model import (
Person,
RevisionType,
Revision,
TimestampWithTimezone,
Sha1Git,
)
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import api_info, release_name
logger = logging.getLogger(__name__)
+EMPTY_PERSON = Person(fullname=b"", name=None, email=None)
+
+
class NpmLoader(PackageLoader):
"""Load npm origin's artifact releases into swh archive.
"""
visit_type = "npm"
def __init__(self, url: str):
"""Constructor
Args
str: origin url (e.g. https://www.npmjs.com/package/<package-name>)
"""
super().__init__(url=url)
package_name = url.split("https://www.npmjs.com/package/")[1]
safe_name = quote(package_name, safe="")
self.provider_url = f"https://replicate.npmjs.com/{safe_name}/"
self._info: Dict[str, Any] = {}
self._versions = None
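# Illustrative derivation (matching the "org" package used in the tests at the
# bottom of this diff): for url = "https://www.npmjs.com/package/org",
# package_name is "org" and provider_url becomes
# "https://replicate.npmjs.com/org/".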
@property
def info(self) -> Dict[str, Any]:
"""Return the project metadata information (fetched from npm registry)
"""
if not self._info:
self._info = api_info(self.provider_url)
return self._info
def get_versions(self) -> Sequence[str]:
return sorted(list(self.info["versions"].keys()))
def get_default_version(self) -> str:
return self.info["dist-tags"].get("latest", "")
def get_package_info(
self, version: str
) -> Generator[Tuple[str, Mapping[str, Any]], None, None]:
meta = self.info["versions"][version]
url = meta["dist"]["tarball"]
p_info = {
"url": url,
"filename": os.path.basename(url),
"raw": meta,
}
yield release_name(version), p_info
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict
) -> Optional[bytes]:
return artifact_to_revision_id(known_artifacts, artifact_metadata)
def build_revision(
self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
i_metadata = extract_intrinsic_metadata(uncompressed_path)
if not i_metadata:
return None
# from intrinsic metadata
author = extract_npm_package_author(i_metadata)
message = i_metadata["version"].encode("ascii")
# from extrinsic metadata
# No date available in intrinsic metadata: retrieve it from the API
# metadata, using the version number that the API claims this package
# has.
extrinsic_version = a_metadata["version"]
if "time" in self.info:
date = self.info["time"][extrinsic_version]
elif "mtime" in a_metadata:
date = a_metadata["mtime"]
else:
artifact_name = os.path.basename(a_metadata["dist"]["tarball"])
raise ValueError(
"Origin %s: Cannot determine upload time for artifact %s."
% (self.url, artifact_name)
)
date = TimestampWithTimezone.from_iso8601(date)
# FIXME: this is to remain bug-compatible with earlier versions:
date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0))
r = Revision(
type=RevisionType.TAR,
message=message,
author=author,
date=date,
committer=author,
committer_date=date,
parents=(),
directory=directory,
synthetic=True,
metadata={
"intrinsic": {"tool": "package.json", "raw": i_metadata,},
"extrinsic": {
"provider": self.provider_url,
"when": self.visit_date.isoformat(),
"raw": a_metadata,
},
},
)
return r
def artifact_to_revision_id(
known_artifacts: Dict, artifact_metadata: Dict
) -> Optional[bytes]:
"""Given metadata artifact, solves the associated revision id.
The following code allows to deal with 2 metadata formats:
- old format sample::
{
'package_source': {
'sha1': '05181c12cd8c22035dd31155656826b85745da37',
}
}
- new format sample::
{
'original_artifact': [{
'checksums': {
'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa
...
},
}],
...
}
"""
shasum = artifact_metadata["dist"]["shasum"]
for rev_id, known_artifact in known_artifacts.items():
known_original_artifact = known_artifact.get("original_artifact")
if not known_original_artifact:
# previous loader-npm version kept original artifact elsewhere
known_original_artifact = known_artifact.get("package_source")
if not known_original_artifact:
continue
original_hash = known_original_artifact["sha1"]
else:
assert isinstance(known_original_artifact, list)
original_hash = known_original_artifact[0]["checksums"]["sha1"]
if shasum == original_hash:
return rev_id
return None
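# Worked example (same fixture values as the tests at the end of this diff):
# with artifact_metadata = {"dist": {"shasum":
# "05181c12cd8c22035dd31155656826b85745da37"}}, a known artifact whose
# old-style "package_source" sha1 or new-style
# "original_artifact[0]['checksums']['sha1']" equals that shasum yields its
# revision id; with no match, None is returned.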
-def extract_npm_package_author(package_json) -> Person:
+def _author_str(author_data: Union[Dict, List, str]) -> str:
+ """Parse author from package.json author fields
+
+ """
+ if isinstance(author_data, dict):
+ author_str = ""
+ name = author_data.get("name")
+ if name is not None:
+ if isinstance(name, str):
+ author_str += name
+ elif isinstance(name, list):
+ author_str += _author_str(name[0]) if len(name) > 0 else ""
+ email = author_data.get("email")
+ if email is not None:
+ author_str += f" <{email}>"
+ result = author_str
+ elif isinstance(author_data, list):
+ result = _author_str(author_data[0]) if len(author_data) > 0 else ""
+ else:
+ result = author_data
+ return result
+
+
+def extract_npm_package_author(package_json: Dict[str, Any]) -> Person:
"""
Extract package author from a ``package.json`` file content and
return it in swh format.
Args:
- package_json (dict): Dict holding the content of parsed
+ package_json: Dict holding the content of parsed
``package.json`` file
Returns:
Person
"""
-
- def _author_str(author_data):
- if type(author_data) is dict:
- author_str = ""
- if "name" in author_data:
- author_str += author_data["name"]
- if "email" in author_data:
- author_str += " <%s>" % author_data["email"]
- return author_str
- elif type(author_data) is list:
- return _author_str(author_data[0]) if len(author_data) > 0 else ""
- else:
- return author_data
-
for author_key in ("author", "authors"):
if author_key in package_json:
- author_str = _author_str(package_json[author_key])
+ author_data = package_json[author_key]
+ if author_data is None:
+ return EMPTY_PERSON
+ author_str = _author_str(author_data)
return Person.from_fullname(author_str.encode())
- return Person(fullname=b"", name=None, email=None)
+ return EMPTY_PERSON
def _lstrip_bom(s, bom=BOM_UTF8):
if s.startswith(bom):
return s[len(bom) :]
else:
return s
def load_json(json_bytes):
"""
Try to load JSON from bytes and return a dictionary.
First try to decode from utf-8. If the decoding failed,
try to detect the encoding and decode again with replace
error handling.
If JSON is malformed, an empty dictionary will be returned.
Args:
json_bytes (bytes): binary content of a JSON file
Returns:
dict: JSON data loaded in a dictionary
"""
json_data = {}
try:
json_str = _lstrip_bom(json_bytes).decode("utf-8")
except UnicodeDecodeError:
encoding = chardet.detect(json_bytes)["encoding"]
if encoding:
json_str = json_bytes.decode(encoding, "replace")
try:
json_data = json.loads(json_str)
except json.decoder.JSONDecodeError:
pass
return json_data
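# Illustrative behaviour of load_json:
#   load_json(b'{"name": "org"}')     -> {"name": "org"}
#   load_json(b"definitely not json") -> {}  (malformed JSON yields an empty dict)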
def extract_intrinsic_metadata(dir_path: str) -> Dict:
"""Given an uncompressed path holding the pkginfo file, returns a
pkginfo parsed structure as a dict.
The release artifact contains at their root one folder. For example:
$ tar tvf zprint-0.0.6.tar.gz
drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
...
Args:
dir_path (str): Path to the uncompressed directory
representing a release artifact from npm.
Returns:
the pkginfo parsed structure as a dict if any or None if
none was present.
"""
# Retrieve the root folder of the archive
if not os.path.exists(dir_path):
return {}
lst = os.listdir(dir_path)
if len(lst) == 0:
return {}
project_dirname = lst[0]
package_json_path = os.path.join(dir_path, project_dirname, "package.json")
if not os.path.exists(package_json_path):
return {}
with open(package_json_path, "rb") as package_json_file:
package_json_bytes = package_json_file.read()
return load_json(package_json_bytes)
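# Illustrative use of extract_intrinsic_metadata (hypothetical path): for an
# artifact unpacked as /tmp/org-0.0.2/package/package.json,
# extract_intrinsic_metadata("/tmp/org-0.0.2") returns the parsed package.json
# as a dict, and {} when the directory or the package.json file is missing.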
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
index 53a7b28..6e02b73 100644
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -1,620 +1,659 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import os
import pytest
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Person, Snapshot, SnapshotBranch, TargetType
from swh.loader.package.npm.loader import (
+ _author_str,
NpmLoader,
extract_npm_package_author,
artifact_to_revision_id,
)
from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import (
assert_last_visit_matches,
check_snapshot,
get_stats,
)
+def test_npm_author_str():
+ for author, expected_author in [
+ ("author", "author"),
+ (
+ ["Al from quantum leap", "hal from 2001 space odyssey"],
+ "Al from quantum leap",
+ ),
+ ([], ""),
+ ({"name": "groot", "email": "groot@galaxy.org",}, "groot <groot@galaxy.org>"),
+ ({"name": "somebody",}, "somebody"),
+ ({"email": "no@one.org"}, " <no@one.org>"), # note first elt is an extra blank
+ ({"name": "no one", "email": None,}, "no one"),
+ ({"email": None,}, ""),
+ ({"name": None}, ""),
+ ({"name": None, "email": None,}, ""),
+ ({}, ""),
+ (None, None),
+ ({"name": []}, "",),
+ (
+ {"name": ["Susan McSween", "William H. Bonney", "Doc Scurlock",]},
+ "Susan McSween",
+ ),
+ (None, None),
+ ]:
+ assert _author_str(author) == expected_author
+
+
def test_extract_npm_package_author(datadir):
package_metadata_filepath = os.path.join(
datadir, "https_replicate.npmjs.com", "org_visit1"
)
with open(package_metadata_filepath) as json_file:
package_metadata = json.load(json_file)
assert extract_npm_package_author(package_metadata["versions"]["0.0.2"]) == Person(
fullname=b"mooz <stillpedant@gmail.com>",
name=b"mooz",
email=b"stillpedant@gmail.com",
)
assert extract_npm_package_author(package_metadata["versions"]["0.0.3"]) == Person(
fullname=b"Masafumi Oyamada <stillpedant@gmail.com>",
name=b"Masafumi Oyamada",
email=b"stillpedant@gmail.com",
)
package_json = json.loads(
"""
{
"name": "highlightjs-line-numbers.js",
"version": "2.7.0",
"description": "Highlight.js line numbers plugin.",
"main": "src/highlightjs-line-numbers.js",
"dependencies": {},
"devDependencies": {
"gulp": "^4.0.0",
"gulp-rename": "^1.4.0",
"gulp-replace": "^0.6.1",
"gulp-uglify": "^1.2.0"
},
"repository": {
"type": "git",
"url": "https://github.com/wcoder/highlightjs-line-numbers.js.git"
},
"author": "Yauheni Pakala <evgeniy.pakalo@gmail.com>",
"license": "MIT",
"bugs": {
"url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues"
},
"homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/"
}"""
- ) # noqa
+ )
assert extract_npm_package_author(package_json) == Person(
fullname=b"Yauheni Pakala <evgeniy.pakalo@gmail.com>",
name=b"Yauheni Pakala",
email=b"evgeniy.pakalo@gmail.com",
)
package_json = json.loads(
"""
{
"name": "3-way-diff",
"version": "0.0.1",
"description": "3-way diffing of JavaScript objects",
"main": "index.js",
"authors": [
{
"name": "Shawn Walsh",
"url": "https://github.com/shawnpwalsh"
},
{
"name": "Markham F Rollins IV",
"url": "https://github.com/mrollinsiv"
}
],
"keywords": [
"3-way diff",
"3 way diff",
"three-way diff",
"three way diff"
],
"devDependencies": {
"babel-core": "^6.20.0",
"babel-preset-es2015": "^6.18.0",
"mocha": "^3.0.2"
},
"dependencies": {
"lodash": "^4.15.0"
}
}"""
)
assert extract_npm_package_author(package_json) == Person(
fullname=b"Shawn Walsh", name=b"Shawn Walsh", email=None
)
package_json = json.loads(
"""
{
"name": "yfe-ynpm",
"version": "1.0.0",
"homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm",
"repository": {
"type": "git",
"url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git"
},
"author": [
"fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)",
"xufuzi <xufuzi@ywwl.com> (https://7993.org)"
],
"license": "MIT"
}"""
)
assert extract_npm_package_author(package_json) == Person(
fullname=b"fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)",
name=b"fengmk2",
email=b"fengmk2@gmail.com",
)
package_json = json.loads(
"""
{
"name": "umi-plugin-whale",
"version": "0.0.8",
"description": "Internal contract component",
"authors": {
"name": "xiaohuoni",
"email": "448627663@qq.com"
},
"repository": "alitajs/whale",
"devDependencies": {
"np": "^3.0.4",
"umi-tools": "*"
},
"license": "MIT"
}"""
)
assert extract_npm_package_author(package_json) == Person(
fullname=b"xiaohuoni <448627663@qq.com>",
name=b"xiaohuoni",
email=b"448627663@qq.com",
)
+ package_json_no_authors = json.loads(
+ """{
+ "authors": null,
+ "license": "MIT"
+ }"""
+ )
+
+ assert extract_npm_package_author(package_json_no_authors) == Person(
+ fullname=b"", name=None, email=None
+ )
+
def normalize_hashes(hashes):
if isinstance(hashes, str):
return hash_to_bytes(hashes)
if isinstance(hashes, list):
return [hash_to_bytes(x) for x in hashes]
return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()}
_expected_new_contents_first_visit = normalize_hashes(
[
"4ce3058e16ab3d7e077f65aabf855c34895bf17c",
"858c3ceee84c8311adc808f8cdb30d233ddc9d18",
"0fa33b4f5a4e0496da6843a38ff1af8b61541996",
"85a410f8ef8eb8920f2c384a9555566ad4a2e21b",
"9163ac8025923d5a45aaac482262893955c9b37b",
"692cf623b8dd2c5df2c2998fd95ae4ec99882fb4",
"18c03aac6d3e910efb20039c15d70ab5e0297101",
"41265c42446aac17ca769e67d1704f99e5a1394d",
"783ff33f5882813dca9239452c4a7cadd4dba778",
"b029cfb85107aee4590c2434a3329bfcf36f8fa1",
"112d1900b4c2e3e9351050d1b542c9744f9793f3",
"5439bbc4bd9a996f1a38244e6892b71850bc98fd",
"d83097a2f994b503185adf4e719d154123150159",
"d0939b4898e83090ee55fd9d8a60e312cfadfbaf",
"b3523a26f7147e4af40d9d462adaae6d49eda13e",
"cd065fb435d6fb204a8871bcd623d0d0e673088c",
"2854a40855ad839a54f4b08f5cff0cf52fca4399",
"b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe",
"0f73d56e1cf480bded8a1ecf20ec6fc53c574713",
"0d9882b2dfafdce31f4e77fe307d41a44a74cefe",
"585fc5caab9ead178a327d3660d35851db713df1",
"e8cd41a48d79101977e3036a87aeb1aac730686f",
"5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7",
"9c3cc2763bf9e9e37067d3607302c4776502df98",
"3649a68410e354c83cd4a38b66bd314de4c8f5c9",
"e96ed0c091de1ebdf587104eaf63400d1974a1fe",
"078ca03d2f99e4e6eab16f7b75fbb7afb699c86c",
"38de737da99514de6559ff163c988198bc91367a",
]
)
_expected_new_directories_first_visit = normalize_hashes(
[
"3370d20d6f96dc1c9e50f083e2134881db110f4f",
"42753c0c2ab00c4501b552ac4671c68f3cf5aece",
"d7895533ef5edbcffdea3f057d9fef3a1ef845ce",
"80579be563e2ef3e385226fe7a3f079b377f142c",
"3b0ddc6a9e58b4b53c222da4e27b280b6cda591c",
"bcad03ce58ac136f26f000990fc9064e559fe1c0",
"5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca",
"e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd",
"584b5b4b6cf7f038095e820b99386a9c232de931",
"184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a",
"bb5f4ee143c970367eb409f2e4c1104898048b9d",
"1b95491047add1103db0dfdfa84a9735dcb11e88",
"a00c6de13471a2d66e64aca140ddb21ef5521e62",
"5ce6c1cd5cda2d546db513aaad8c72a44c7771e2",
"c337091e349b6ac10d38a49cdf8c2401ef9bb0f2",
"202fafcd7c0f8230e89d5496ad7f44ab12b807bf",
"775cc516543be86c15c1dc172f49c0d4e6e78235",
"ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e",
]
)
_expected_new_revisions_first_visit = normalize_hashes(
{
"d8a1c7474d2956ac598a19f0f27d52f7015f117e": (
"42753c0c2ab00c4501b552ac4671c68f3cf5aece"
),
"5f9eb78af37ffd12949f235e86fac04898f9f72a": (
"3370d20d6f96dc1c9e50f083e2134881db110f4f"
),
"ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a": (
"d7895533ef5edbcffdea3f057d9fef3a1ef845ce"
),
}
)
def package_url(package):
return "https://www.npmjs.com/package/%s" % package
def package_metadata_url(package):
return "https://replicate.npmjs.com/%s/" % package
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
package = "org"
loader = NpmLoader(package_url(package))
actual_load_status = loader.load()
assert actual_load_status["status"] == "eventful"
assert actual_load_status["snapshot_id"] is not None
expected_revision_id = hash_to_bytes("d8a1c7474d2956ac598a19f0f27d52f7015f117e")
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(
revision["metadata"],
paths=[
("intrinsic.tool", str),
("intrinsic.raw", dict),
("extrinsic.provider", str),
("extrinsic.when", str),
("extrinsic.raw", dict),
("original_artifact", list),
],
)
for original_artifact in revision["metadata"]["original_artifact"]:
check_metadata_paths(
original_artifact,
paths=[("filename", str), ("length", int), ("checksums", dict),],
)
def test_npm_loader_first_visit(swh_config, requests_mock_datadir):
package = "org"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
expected_snapshot_id = hash_to_bytes("d0587e1195aed5a8800411a008f2f2d627f18e2d")
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id.hex(),
}
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot_id
)
stats = get_stats(loader.storage)
assert {
"content": len(_expected_new_contents_first_visit),
"directory": len(_expected_new_directories_first_visit),
"origin": 1,
"origin_visit": 1,
"person": 2,
"release": 0,
"revision": len(_expected_new_revisions_first_visit),
"skipped_content": 0,
"snapshot": 1,
} == stats
assert len(
list(loader.storage.content_get(_expected_new_contents_first_visit))
) == len(_expected_new_contents_first_visit)
assert (
list(loader.storage.directory_missing(_expected_new_directories_first_visit))
== []
)
assert (
list(loader.storage.revision_missing(_expected_new_revisions_first_visit)) == []
)
expected_snapshot = Snapshot(
id=expected_snapshot_id,
branches={
b"HEAD": SnapshotBranch(
target=b"releases/0.0.4", target_type=TargetType.ALIAS
),
b"releases/0.0.2": SnapshotBranch(
target=hash_to_bytes("d8a1c7474d2956ac598a19f0f27d52f7015f117e"),
target_type=TargetType.REVISION,
),
b"releases/0.0.3": SnapshotBranch(
target=hash_to_bytes("5f9eb78af37ffd12949f235e86fac04898f9f72a"),
target_type=TargetType.REVISION,
),
b"releases/0.0.4": SnapshotBranch(
target=hash_to_bytes("ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a"),
target_type=TargetType.REVISION,
),
},
)
check_snapshot(expected_snapshot, loader.storage)
def test_npm_loader_incremental_visit(swh_config, requests_mock_datadir_visits):
package = "org"
url = package_url(package)
loader = NpmLoader(url)
expected_snapshot_id = hash_to_bytes("d0587e1195aed5a8800411a008f2f2d627f18e2d")
actual_load_status = loader.load()
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id.hex(),
}
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot_id
)
stats = get_stats(loader.storage)
assert {
"content": len(_expected_new_contents_first_visit),
"directory": len(_expected_new_directories_first_visit),
"origin": 1,
"origin_visit": 1,
"person": 2,
"release": 0,
"revision": len(_expected_new_revisions_first_visit),
"skipped_content": 0,
"snapshot": 1,
} == stats
loader._info = None # reset loader internal state
actual_load_status2 = loader.load()
assert actual_load_status2["status"] == "eventful"
snap_id2 = actual_load_status2["snapshot_id"]
assert snap_id2 is not None
assert snap_id2 != actual_load_status["snapshot_id"]
assert_last_visit_matches(loader.storage, url, status="full", type="npm")
stats = get_stats(loader.storage)
assert { # 3 new releases artifacts
"content": len(_expected_new_contents_first_visit) + 14,
"directory": len(_expected_new_directories_first_visit) + 15,
"origin": 1,
"origin_visit": 2,
"person": 2,
"release": 0,
"revision": len(_expected_new_revisions_first_visit) + 3,
"skipped_content": 0,
"snapshot": 2,
} == stats
urls = [
m.url
for m in requests_mock_datadir_visits.request_history
if m.url.startswith("https://registry.npmjs.org")
]
assert len(urls) == len(set(urls))  # we visited each artifact once across the 2 visits
@pytest.mark.usefixtures("requests_mock_datadir")
def test_npm_loader_version_divergence(swh_config):
package = "@aller_shared"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
expected_snapshot_id = hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92")
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id.hex(),
}
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot_id
)
stats = get_stats(loader.storage)
assert { # 1 new releases artifacts
"content": 534,
"directory": 153,
"origin": 1,
"origin_visit": 1,
"person": 1,
"release": 0,
"revision": 2,
"skipped_content": 0,
"snapshot": 1,
} == stats
expected_snapshot = Snapshot(
id=expected_snapshot_id,
branches={
b"HEAD": SnapshotBranch(
target_type=TargetType.ALIAS, target=b"releases/0.1.0"
),
b"releases/0.1.0": SnapshotBranch(
target_type=TargetType.REVISION,
target=hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"),
),
b"releases/0.1.1-alpha.14": SnapshotBranch(
target_type=TargetType.REVISION,
target=hash_to_bytes("05181c12cd8c22035dd31155656826b85745da37"),
),
},
)
check_snapshot(expected_snapshot, loader.storage)
def test_npm_artifact_to_revision_id_none():
"""Current loader version should stop soon if nothing can be found
"""
artifact_metadata = {
"dist": {"shasum": "05181c12cd8c22035dd31155656826b85745da37",},
}
known_artifacts = {
"b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92": {},
}
assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None
def test_npm_artifact_to_revision_id_old_loader_version():
"""Current loader version should solve old metadata scheme
"""
artifact_metadata = {
"dist": {"shasum": "05181c12cd8c22035dd31155656826b85745da37",}
}
known_artifacts = {
hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): {
"package_source": {"sha1": "something-wrong"}
},
hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): {
"package_source": {"sha1": "05181c12cd8c22035dd31155656826b85745da37",}
},
}
assert artifact_to_revision_id(known_artifacts, artifact_metadata) == hash_to_bytes(
"845673bfe8cbd31b1eaf757745a964137e6f9116"
)
def test_npm_artifact_to_revision_id_current_loader_version():
"""Current loader version should be able to solve current metadata scheme
"""
artifact_metadata = {
"dist": {"shasum": "05181c12cd8c22035dd31155656826b85745da37",}
}
known_artifacts = {
hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): {
"original_artifact": [
{"checksums": {"sha1": "05181c12cd8c22035dd31155656826b85745da37"},}
],
},
hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): {
"original_artifact": [{"checksums": {"sha1": "something-wrong"},}],
},
}
assert artifact_to_revision_id(known_artifacts, artifact_metadata) == hash_to_bytes(
"b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"
)
def test_npm_artifact_with_no_intrinsic_metadata(swh_config, requests_mock_datadir):
"""Skip artifact with no intrinsic metadata during ingestion
"""
package = "nativescript-telerik-analytics"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
# no branch as one artifact without any intrinsic metadata
expected_snapshot = Snapshot(
id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={},
)
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot.id.hex(),
}
check_snapshot(expected_snapshot, loader.storage)
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot.id
)
def test_npm_artifact_with_no_upload_time(swh_config, requests_mock_datadir):
"""With no time upload, artifact is skipped
"""
package = "jammit-no-time"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
# no branch as one artifact without any intrinsic metadata
expected_snapshot = Snapshot(
id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={},
)
assert actual_load_status == {
"status": "uneventful",
"snapshot_id": expected_snapshot.id.hex(),
}
check_snapshot(expected_snapshot, loader.storage)
assert_last_visit_matches(
loader.storage, url, status="partial", type="npm", snapshot=expected_snapshot.id
)
def test_npm_artifact_use_mtime_if_no_time(swh_config, requests_mock_datadir):
"""With no time upload, artifact is skipped
"""
package = "jammit-express"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
expected_snapshot_id = hash_to_bytes("d6e08e19159f77983242877c373c75222d5ae9dd")
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id.hex(),
}
# artifact is used
expected_snapshot = Snapshot(
id=expected_snapshot_id,
branches={
b"HEAD": SnapshotBranch(
target_type=TargetType.ALIAS, target=b"releases/0.0.1"
),
b"releases/0.0.1": SnapshotBranch(
target_type=TargetType.REVISION,
target=hash_to_bytes("9e4dd2b40d1b46b70917c0949aa2195c823a648e"),
),
},
)
check_snapshot(expected_snapshot, loader.storage)
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot.id
)
def test_npm_no_artifact(swh_config, requests_mock_datadir):
"""If no artifacts at all is found for origin, the visit fails completely
"""
package = "catify"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
assert actual_load_status == {
"status": "failed",
}
assert_last_visit_matches(loader.storage, url, status="partial", type="npm")