F9123652
diff --git a/PKG-INFO b/PKG-INFO
index 11b4266..67e16d6 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,32 +1,32 @@
Metadata-Version: 2.1
Name: swh.loader.core
-Version: 0.6.0
+Version: 0.6.1
Summary: Software Heritage Base Loader
Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/
Description: SWH-loader-core
===============
The Software Heritage Core Loader provides low-level loading utilities and
helpers used by other loaders.
The main entry points are classes:
- :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn)
- :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...)
- :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...)
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
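As a quick orientation for the entry points listed in the description above, a minimal import sketch (not part of this diff):

# A new loader subclasses one of the base classes named in the description.
from swh.loader.core.loader import BaseLoader, DVCSLoader  # low-level / DVCS loaders
from swh.loader.package.loader import PackageLoader        # package loaders (PyPI, npm, ...)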
diff --git a/swh.loader.core.egg-info/PKG-INFO b/swh.loader.core.egg-info/PKG-INFO
index 11b4266..67e16d6 100644
--- a/swh.loader.core.egg-info/PKG-INFO
+++ b/swh.loader.core.egg-info/PKG-INFO
@@ -1,32 +1,32 @@
Metadata-Version: 2.1
Name: swh.loader.core
-Version: 0.6.0
+Version: 0.6.1
Summary: Software Heritage Base Loader
Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/
Description: SWH-loader-core
===============
The Software Heritage Core Loader provides low-level loading utilities and
helpers used by other loaders.
The main entry points are classes:
- :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn)
- :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...)
- :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...)
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
index afa682c..8c4f8c4 100644
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -1,430 +1,430 @@
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import hashlib
import logging
import os
from abc import ABCMeta, abstractmethod
from typing import Any, Dict, Iterable, Optional, Tuple
from swh.core import config
from swh.model.model import (
BaseContent,
Content,
SkippedContent,
Directory,
Origin,
OriginVisit,
OriginVisitStatus,
Revision,
Release,
Sha1Git,
Snapshot,
)
from swh.storage import get_storage
from swh.storage.utils import now
class BaseLoader(config.SWHConfig, metaclass=ABCMeta):
"""Mixin base class for loader.
To use this class, you must:
- inherit from this class
- and implement the @abstractmethod methods:
- :func:`prepare`: First step executed by the loader to prepare some
state needed by the :func:`load` method.
- :func:`get_origin`: Retrieve the origin that is currently being loaded.
- :func:`fetch_data`: The method to implement to fetch the data to
inject into swh (through the store_data method)
- :func:`store_data`: Store the fetched data.
- :func:`visit_status`: Explicit status of the visit ('partial' or
'full')
- :func:`load_status`: Explicit status of the loading, for use by the
scheduler (eventful/uneventful/temporary failure/permanent failure).
- :func:`cleanup`: Last step executed by the loader.
The entry point for the resulting loader is :func:`load`.
You can take a look at some example classes:
- :class:`BaseSvnLoader`
"""
CONFIG_BASE_FILENAME = None # type: Optional[str]
DEFAULT_CONFIG = {
"storage": ("dict", {"cls": "remote", "url": "http://localhost:5002/",}),
"max_content_size": ("int", 100 * 1024 * 1024),
"save_data": ("bool", False),
"save_data_path": ("str", ""),
} # type: Dict[str, Tuple[str, Any]]
ADDITIONAL_CONFIG = {} # type: Dict[str, Tuple[str, Any]]
def __init__(
self, logging_class: Optional[str] = None, config: Dict[str, Any] = {}
):
if config:
self.config = config
else:
self.config = self.parse_config_file(
additional_configs=[self.ADDITIONAL_CONFIG]
)
self.storage = get_storage(**self.config["storage"])
if logging_class is None:
logging_class = "%s.%s" % (
self.__class__.__module__,
self.__class__.__name__,
)
self.log = logging.getLogger(logging_class)
_log = logging.getLogger("requests.packages.urllib3.connectionpool")
_log.setLevel(logging.WARN)
self.max_content_size = self.config["max_content_size"]
# possibly overridden in self.prepare method
self.visit_date: Optional[datetime.datetime] = None
self.origin: Optional[Origin] = None
if not hasattr(self, "visit_type"):
self.visit_type: Optional[str] = None
self.origin_metadata: Dict[str, Any] = {}
self.loaded_snapshot_id: Optional[Sha1Git] = None
# Make sure the config is sane
save_data = self.config.get("save_data")
if save_data:
path = self.config["save_data_path"]
os.stat(path)
if not os.access(path, os.R_OK | os.W_OK):
raise PermissionError("Permission denied: %r" % path)
def save_data(self) -> None:
"""Save the data associated to the current load"""
raise NotImplementedError
def get_save_data_path(self) -> str:
"""The path to which we archive the loader's raw data"""
if not hasattr(self, "__save_data_path"):
year = str(self.visit_date.year) # type: ignore
assert self.origin
url = self.origin.url.encode("utf-8")
origin_url_hash = hashlib.sha1(url).hexdigest()
path = "%s/sha1:%s/%s/%s" % (
self.config["save_data_path"],
origin_url_hash[0:2],
origin_url_hash,
year,
)
os.makedirs(path, exist_ok=True)
self.__save_data_path = path
return self.__save_data_path
def flush(self) -> None:
"""Flush any potential buffered data not sent to swh-storage.
"""
self.storage.flush()
@abstractmethod
def cleanup(self) -> None:
"""Last step executed by the loader.
"""
pass
@abstractmethod
def prepare_origin_visit(self, *args, **kwargs) -> None:
"""First step executed by the loader to prepare origin and visit
references. Set/update self.origin, and
optionally self.origin_url, self.visit_date.
"""
pass
def _store_origin_visit(self) -> None:
"""Store origin and visit references. Sets the self.visit references.
"""
assert self.origin
self.storage.origin_add([self.origin])
if not self.visit_date: # now as default visit_date if not provided
self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
assert isinstance(self.visit_date, datetime.datetime)
assert isinstance(self.visit_type, str)
self.visit = self.storage.origin_visit_add(
[
OriginVisit(
origin=self.origin.url, date=self.visit_date, type=self.visit_type,
)
]
)[0]
@abstractmethod
def prepare(self, *args, **kwargs) -> None:
"""Second step executed by the loader to prepare some state needed by
the loader.
"""
pass
def get_origin(self) -> Origin:
"""Get the origin that is currently being loaded.
self.origin should be set in :func:`prepare_origin`
Returns:
dict: an origin ready to be sent to storage by
- :func:`origin_add_one`.
+ :func:`origin_add`.
"""
assert self.origin
return self.origin
@abstractmethod
def fetch_data(self) -> bool:
"""Fetch the data from the source the loader is currently loading
(ex: git/hg/svn/... repository).
Returns:
a value that is interpreted as a boolean. If True, fetch_data needs
to be called again to complete loading.
"""
pass
@abstractmethod
def store_data(self):
"""Store fetched data in the database.
Should call the :func:`maybe_load_xyz` methods, which handle the
bundles sent to storage, rather than send directly.
"""
pass
def store_metadata(self) -> None:
"""Store fetched metadata in the database.
For more information, see implementation in :class:`DepositLoader`.
"""
pass
def load_status(self) -> Dict[str, str]:
"""Detailed loading status.
Defaults to logging an eventful load.
Returns: a dictionary that is eventually passed back as the task's
result to the scheduler, allowing tuning of the task recurrence
mechanism.
"""
return {
"status": "eventful",
}
def post_load(self, success: bool = True) -> None:
"""Permit the loader to do some additional actions according to status
after the loading is done. The flag success indicates the
loading's status.
Defaults to doing nothing.
This is up to the implementer of this method to make sure this
does not break.
Args:
success (bool): the success status of the loading
"""
pass
def visit_status(self) -> str:
"""Detailed visit status.
Defaults to logging a full visit.
"""
return "full"
def pre_cleanup(self) -> None:
"""As a first step, will try and check for dangling data to cleanup.
This should do its best to avoid raising issues.
"""
pass
def load(self, *args, **kwargs) -> Dict[str, str]:
r"""Loading logic for the loader to follow:
- 1. Call :meth:`prepare_origin_visit` to prepare the
origin and visit we will associate loading data to
- 2. Store the actual ``origin_visit`` to storage
- 3. Call :meth:`prepare` to prepare any eventual state
- 4. Call :meth:`get_origin` to get the origin we work with and store
- while True:
- 5. Call :meth:`fetch_data` to fetch the data to store
- 6. Call :meth:`store_data` to store the data
- 7. Call :meth:`cleanup` to clean up any eventual state put in place
in :meth:`prepare` method.
"""
try:
self.pre_cleanup()
except Exception:
msg = "Cleaning up dangling data failed! Continue loading."
self.log.warning(msg)
self.prepare_origin_visit(*args, **kwargs)
self._store_origin_visit()
assert self.origin
try:
self.prepare(*args, **kwargs)
while True:
more_data_to_fetch = self.fetch_data()
self.store_data()
if not more_data_to_fetch:
break
self.store_metadata()
visit_status = OriginVisitStatus(
origin=self.origin.url,
visit=self.visit.visit,
date=now(),
status=self.visit_status(),
snapshot=self.loaded_snapshot_id,
)
self.storage.origin_visit_status_add([visit_status])
self.post_load()
except Exception:
self.log.exception(
"Loading failure, updating to `partial` status",
extra={"swh_task_args": args, "swh_task_kwargs": kwargs,},
)
visit_status = OriginVisitStatus(
origin=self.origin.url,
visit=self.visit.visit,
date=now(),
status="partial",
snapshot=self.loaded_snapshot_id,
)
self.storage.origin_visit_status_add([visit_status])
self.post_load(success=False)
return {"status": "failed"}
finally:
self.flush()
self.cleanup()
return self.load_status()
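# ---------------------------------------------------------------------------
# Illustrative sketch (not part of swh-loader-core): a minimal concrete loader
# implementing the abstract methods described in the BaseLoader docstring
# above. The class name, visit type and example URL are hypothetical.
# ---------------------------------------------------------------------------
class TrivialLoader(BaseLoader):
    visit_type = "trivial"

    def prepare_origin_visit(self, url: str) -> None:
        self.origin = Origin(url=url)

    def prepare(self, url: str) -> None:
        pass  # nothing to set up for this sketch

    def fetch_data(self) -> bool:
        return False  # False: no further fetch_data/store_data round needed

    def store_data(self) -> None:
        pass  # a real loader would send fetched objects to self.storage here

    def cleanup(self) -> None:
        pass  # nothing to tear down

# Typical use, following the sequence documented in :meth:`BaseLoader.load`:
#   loader = TrivialLoader(config={
#       "storage": {"cls": "memory"},
#       "max_content_size": 100 * 1024 * 1024,
#       "save_data": False,
#       "save_data_path": "",
#   })
#   loader.load(url="https://example.org/origin")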
class DVCSLoader(BaseLoader):
"""This base class is a pattern for dvcs loaders (e.g. git, mercurial).
Those loaders are able to load all the data in one go. For example, the
loader defined in swh-loader-git :class:`BulkUpdater`.
For other loaders (stateful one, (e.g :class:`SWHSvnLoader`),
inherit directly from :class:`BaseLoader`.
"""
ADDITIONAL_CONFIG = {} # type: Dict[str, Tuple[str, Any]]
def cleanup(self) -> None:
"""Clean up an eventual state installed for computations."""
pass
def has_contents(self) -> bool:
"""Checks whether we need to load contents"""
return True
def get_contents(self) -> Iterable[BaseContent]:
"""Get the contents that need to be loaded"""
raise NotImplementedError
def has_directories(self) -> bool:
"""Checks whether we need to load directories"""
return True
def get_directories(self) -> Iterable[Directory]:
"""Get the directories that need to be loaded"""
raise NotImplementedError
def has_revisions(self) -> bool:
"""Checks whether we need to load revisions"""
return True
def get_revisions(self) -> Iterable[Revision]:
"""Get the revisions that need to be loaded"""
raise NotImplementedError
def has_releases(self) -> bool:
"""Checks whether we need to load releases"""
return True
def get_releases(self) -> Iterable[Release]:
"""Get the releases that need to be loaded"""
raise NotImplementedError
def get_snapshot(self) -> Snapshot:
"""Get the snapshot that needs to be loaded"""
raise NotImplementedError
def eventful(self) -> bool:
"""Whether the load was eventful"""
raise NotImplementedError
def store_data(self) -> None:
assert self.origin
if self.config.get("save_data"):
self.save_data()
if self.has_contents():
contents = []
skipped_contents = []
for obj in self.get_contents():
if isinstance(obj, Content):
contents.append(obj)
elif isinstance(obj, SkippedContent):
skipped_contents.append(obj)
else:
raise TypeError(f"Unexpected content type: {obj}")
self.storage.skipped_content_add(skipped_contents)
self.storage.content_add(contents)
if self.has_directories():
self.storage.directory_add(self.get_directories())
if self.has_revisions():
self.storage.revision_add(self.get_revisions())
if self.has_releases():
self.storage.release_add(self.get_releases())
snapshot = self.get_snapshot()
self.storage.snapshot_add([snapshot])
self.flush()
self.loaded_snapshot_id = snapshot.id
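To illustrate the DVCSLoader pattern defined above (everything is loaded in one go through the get_contents/get_directories/get_revisions/get_releases/get_snapshot hooks), here is a minimal, hypothetical sketch. It is not part of this diff; the class name and the empty collections are placeholders, and the snapshot id is the empty-snapshot id also asserted in the npm tests below.

from typing import Iterable

from swh.loader.core.loader import DVCSLoader
from swh.model.hashutil import hash_to_bytes
from swh.model.model import BaseContent, Directory, Origin, Release, Revision, Snapshot


class EmptyDVCSLoader(DVCSLoader):
    """Hypothetical DVCS loader that produces an empty snapshot."""

    visit_type = "empty"

    def prepare_origin_visit(self, url: str) -> None:
        self.origin = Origin(url=url)

    def prepare(self, url: str) -> None:
        pass

    def fetch_data(self) -> bool:
        return False  # a single fetch/store round is enough

    def get_contents(self) -> Iterable[BaseContent]:
        return []

    def get_directories(self) -> Iterable[Directory]:
        return []

    def get_revisions(self) -> Iterable[Revision]:
        return []

    def get_releases(self) -> Iterable[Release]:
        return []

    def get_snapshot(self) -> Snapshot:
        # id of the empty snapshot (also used in the npm tests further down)
        return Snapshot(
            id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}
        )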
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
index b6eb5e0..5591edc 100644
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -1,287 +1,302 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
import os
from codecs import BOM_UTF8
-from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional
+from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union
import attr
import chardet
from urllib.parse import quote
from swh.model.model import (
Person,
RevisionType,
Revision,
TimestampWithTimezone,
Sha1Git,
)
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import api_info, release_name
logger = logging.getLogger(__name__)
+EMPTY_PERSON = Person(fullname=b"", name=None, email=None)
+
+
class NpmLoader(PackageLoader):
"""Load npm origin's artifact releases into swh archive.
"""
visit_type = "npm"
def __init__(self, url: str):
"""Constructor
Args
str: origin url (e.g. https://www.npmjs.com/package/<package-name>)
"""
super().__init__(url=url)
package_name = url.split("https://www.npmjs.com/package/")[1]
safe_name = quote(package_name, safe="")
self.provider_url = f"https://replicate.npmjs.com/{safe_name}/"
self._info: Dict[str, Any] = {}
self._versions = None
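# Illustrative derivation (matching the "org" package used in the tests at the
# bottom of this diff): for url = "https://www.npmjs.com/package/org",
# package_name is "org" and provider_url becomes
# "https://replicate.npmjs.com/org/".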
@property
def info(self) -> Dict[str, Any]:
"""Return the project metadata information (fetched from npm registry)
"""
if not self._info:
self._info = api_info(self.provider_url)
return self._info
def get_versions(self) -> Sequence[str]:
return sorted(list(self.info["versions"].keys()))
def get_default_version(self) -> str:
return self.info["dist-tags"].get("latest", "")
def get_package_info(
self, version: str
) -> Generator[Tuple[str, Mapping[str, Any]], None, None]:
meta = self.info["versions"][version]
url = meta["dist"]["tarball"]
p_info = {
"url": url,
"filename": os.path.basename(url),
"raw": meta,
}
yield release_name(version), p_info
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict
) -> Optional[bytes]:
return artifact_to_revision_id(known_artifacts, artifact_metadata)
def build_revision(
self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
i_metadata = extract_intrinsic_metadata(uncompressed_path)
if not i_metadata:
return None
# from intrinsic metadata
author = extract_npm_package_author(i_metadata)
message = i_metadata["version"].encode("ascii")
# from extrinsic metadata
# No date available in intrinsic metadata: retrieve it from the API
# metadata, using the version number that the API claims this package
# has.
extrinsic_version = a_metadata["version"]
if "time" in self.info:
date = self.info["time"][extrinsic_version]
elif "mtime" in a_metadata:
date = a_metadata["mtime"]
else:
artifact_name = os.path.basename(a_metadata["dist"]["tarball"])
raise ValueError(
"Origin %s: Cannot determine upload time for artifact %s."
% (self.url, artifact_name)
)
date = TimestampWithTimezone.from_iso8601(date)
# FIXME: this is to remain bug-compatible with earlier versions:
date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0))
r = Revision(
type=RevisionType.TAR,
message=message,
author=author,
date=date,
committer=author,
committer_date=date,
parents=(),
directory=directory,
synthetic=True,
metadata={
"intrinsic": {"tool": "package.json", "raw": i_metadata,},
"extrinsic": {
"provider": self.provider_url,
"when": self.visit_date.isoformat(),
"raw": a_metadata,
},
},
)
return r
def artifact_to_revision_id(
known_artifacts: Dict, artifact_metadata: Dict
) -> Optional[bytes]:
"""Given metadata artifact, solves the associated revision id.
The following code allows to deal with 2 metadata formats:
- old format sample::
{
'package_source': {
'sha1': '05181c12cd8c22035dd31155656826b85745da37',
}
}
- new format sample::
{
'original_artifact': [{
'checksums': {
'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa
...
},
}],
...
}
"""
shasum = artifact_metadata["dist"]["shasum"]
for rev_id, known_artifact in known_artifacts.items():
known_original_artifact = known_artifact.get("original_artifact")
if not known_original_artifact:
# previous loader-npm version kept original artifact elsewhere
known_original_artifact = known_artifact.get("package_source")
if not known_original_artifact:
continue
original_hash = known_original_artifact["sha1"]
else:
assert isinstance(known_original_artifact, list)
original_hash = known_original_artifact[0]["checksums"]["sha1"]
if shasum == original_hash:
return rev_id
return None
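# Worked example (same fixture values as the tests at the end of this diff):
# with artifact_metadata = {"dist": {"shasum":
# "05181c12cd8c22035dd31155656826b85745da37"}}, a known artifact whose
# old-style "package_source" sha1 or new-style
# "original_artifact[0]['checksums']['sha1']" equals that shasum yields its
# revision id; with no match, None is returned.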
-def extract_npm_package_author(package_json) -> Person:
+def _author_str(author_data: Union[Dict, List, str]) -> str:
+ """Parse author from package.json author fields
+
+ """
+ if isinstance(author_data, dict):
+ author_str = ""
+ name = author_data.get("name")
+ if name is not None:
+ if isinstance(name, str):
+ author_str += name
+ elif isinstance(name, list):
+ author_str += _author_str(name[0]) if len(name) > 0 else ""
+ email = author_data.get("email")
+ if email is not None:
+ author_str += f" <{email}>"
+ result = author_str
+ elif isinstance(author_data, list):
+ result = _author_str(author_data[0]) if len(author_data) > 0 else ""
+ else:
+ result = author_data
+ return result
+
+
+def extract_npm_package_author(package_json: Dict[str, Any]) -> Person:
"""
Extract package author from a ``package.json`` file content and
return it in swh format.
Args:
- package_json (dict): Dict holding the content of parsed
+ package_json: Dict holding the content of parsed
``package.json`` file
Returns:
Person
"""
-
- def _author_str(author_data):
- if type(author_data) is dict:
- author_str = ""
- if "name" in author_data:
- author_str += author_data["name"]
- if "email" in author_data:
- author_str += " <%s>" % author_data["email"]
- return author_str
- elif type(author_data) is list:
- return _author_str(author_data[0]) if len(author_data) > 0 else ""
- else:
- return author_data
-
for author_key in ("author", "authors"):
if author_key in package_json:
- author_str = _author_str(package_json[author_key])
+ author_data = package_json[author_key]
+ if author_data is None:
+ return EMPTY_PERSON
+ author_str = _author_str(author_data)
return Person.from_fullname(author_str.encode())
- return Person(fullname=b"", name=None, email=None)
+ return EMPTY_PERSON
def _lstrip_bom(s, bom=BOM_UTF8):
if s.startswith(bom):
return s[len(bom) :]
else:
return s
def load_json(json_bytes):
"""
Try to load JSON from bytes and return a dictionary.
First try to decode from utf-8. If the decoding failed,
try to detect the encoding and decode again with replace
error handling.
If JSON is malformed, an empty dictionary will be returned.
Args:
json_bytes (bytes): binary content of a JSON file
Returns:
dict: JSON data loaded in a dictionary
"""
json_data = {}
try:
json_str = _lstrip_bom(json_bytes).decode("utf-8")
except UnicodeDecodeError:
encoding = chardet.detect(json_bytes)["encoding"]
if encoding:
json_str = json_bytes.decode(encoding, "replace")
try:
json_data = json.loads(json_str)
except json.decoder.JSONDecodeError:
pass
return json_data
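# Illustrative behaviour of load_json:
#   load_json(b'{"name": "org"}')     -> {"name": "org"}
#   load_json(b"definitely not json") -> {}  (malformed JSON yields an empty dict)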
def extract_intrinsic_metadata(dir_path: str) -> Dict:
"""Given an uncompressed path holding the pkginfo file, returns a
pkginfo parsed structure as a dict.
The release artifact contains at their root one folder. For example:
$ tar tvf zprint-0.0.6.tar.gz
drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
...
Args:
dir_path (str): Path to the uncompressed directory
representing a release artifact from npm.
Returns:
the pkginfo parsed structure as a dict if any or None if
none was present.
"""
# Retrieve the root folder of the archive
if not os.path.exists(dir_path):
return {}
lst = os.listdir(dir_path)
if len(lst) == 0:
return {}
project_dirname = lst[0]
package_json_path = os.path.join(dir_path, project_dirname, "package.json")
if not os.path.exists(package_json_path):
return {}
with open(package_json_path, "rb") as package_json_file:
package_json_bytes = package_json_file.read()
return load_json(package_json_bytes)
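# Illustrative use of extract_intrinsic_metadata (hypothetical path): for an
# artifact unpacked as /tmp/org-0.0.2/package/package.json,
# extract_intrinsic_metadata("/tmp/org-0.0.2") returns the parsed package.json
# as a dict, and {} when the directory or the package.json file is missing.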
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
index 53a7b28..6e02b73 100644
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -1,620 +1,659 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import os
import pytest
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Person, Snapshot, SnapshotBranch, TargetType
from swh.loader.package.npm.loader import (
+ _author_str,
NpmLoader,
extract_npm_package_author,
artifact_to_revision_id,
)
from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import (
assert_last_visit_matches,
check_snapshot,
get_stats,
)
+def test_npm_author_str():
+ for author, expected_author in [
+ ("author", "author"),
+ (
+ ["Al from quantum leap", "hal from 2001 space odyssey"],
+ "Al from quantum leap",
+ ),
+ ([], ""),
+ ({"name": "groot", "email": "groot@galaxy.org",}, "groot <groot@galaxy.org>"),
+ ({"name": "somebody",}, "somebody"),
+ ({"email": "no@one.org"}, " <no@one.org>"), # note first elt is an extra blank
+ ({"name": "no one", "email": None,}, "no one"),
+ ({"email": None,}, ""),
+ ({"name": None}, ""),
+ ({"name": None, "email": None,}, ""),
+ ({}, ""),
+ (None, None),
+ ({"name": []}, "",),
+ (
+ {"name": ["Susan McSween", "William H. Bonney", "Doc Scurlock",]},
+ "Susan McSween",
+ ),
+ (None, None),
+ ]:
+ assert _author_str(author) == expected_author
+
+
def test_extract_npm_package_author(datadir):
package_metadata_filepath = os.path.join(
datadir, "https_replicate.npmjs.com", "org_visit1"
)
with open(package_metadata_filepath) as json_file:
package_metadata = json.load(json_file)
assert extract_npm_package_author(package_metadata["versions"]["0.0.2"]) == Person(
fullname=b"mooz <stillpedant@gmail.com>",
name=b"mooz",
email=b"stillpedant@gmail.com",
)
assert extract_npm_package_author(package_metadata["versions"]["0.0.3"]) == Person(
fullname=b"Masafumi Oyamada <stillpedant@gmail.com>",
name=b"Masafumi Oyamada",
email=b"stillpedant@gmail.com",
)
package_json = json.loads(
"""
{
"name": "highlightjs-line-numbers.js",
"version": "2.7.0",
"description": "Highlight.js line numbers plugin.",
"main": "src/highlightjs-line-numbers.js",
"dependencies": {},
"devDependencies": {
"gulp": "^4.0.0",
"gulp-rename": "^1.4.0",
"gulp-replace": "^0.6.1",
"gulp-uglify": "^1.2.0"
},
"repository": {
"type": "git",
"url": "https://github.com/wcoder/highlightjs-line-numbers.js.git"
},
"author": "Yauheni Pakala <evgeniy.pakalo@gmail.com>",
"license": "MIT",
"bugs": {
"url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues"
},
"homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/"
}"""
- ) # noqa
+ )
assert extract_npm_package_author(package_json) == Person(
fullname=b"Yauheni Pakala <evgeniy.pakalo@gmail.com>",
name=b"Yauheni Pakala",
email=b"evgeniy.pakalo@gmail.com",
)
package_json = json.loads(
"""
{
"name": "3-way-diff",
"version": "0.0.1",
"description": "3-way diffing of JavaScript objects",
"main": "index.js",
"authors": [
{
"name": "Shawn Walsh",
"url": "https://github.com/shawnpwalsh"
},
{
"name": "Markham F Rollins IV",
"url": "https://github.com/mrollinsiv"
}
],
"keywords": [
"3-way diff",
"3 way diff",
"three-way diff",
"three way diff"
],
"devDependencies": {
"babel-core": "^6.20.0",
"babel-preset-es2015": "^6.18.0",
"mocha": "^3.0.2"
},
"dependencies": {
"lodash": "^4.15.0"
}
}"""
)
assert extract_npm_package_author(package_json) == Person(
fullname=b"Shawn Walsh", name=b"Shawn Walsh", email=None
)
package_json = json.loads(
"""
{
"name": "yfe-ynpm",
"version": "1.0.0",
"homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm",
"repository": {
"type": "git",
"url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git"
},
"author": [
"fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)",
"xufuzi <xufuzi@ywwl.com> (https://7993.org)"
],
"license": "MIT"
}"""
)
assert extract_npm_package_author(package_json) == Person(
fullname=b"fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)",
name=b"fengmk2",
email=b"fengmk2@gmail.com",
)
package_json = json.loads(
"""
{
"name": "umi-plugin-whale",
"version": "0.0.8",
"description": "Internal contract component",
"authors": {
"name": "xiaohuoni",
"email": "448627663@qq.com"
},
"repository": "alitajs/whale",
"devDependencies": {
"np": "^3.0.4",
"umi-tools": "*"
},
"license": "MIT"
}"""
)
assert extract_npm_package_author(package_json) == Person(
fullname=b"xiaohuoni <448627663@qq.com>",
name=b"xiaohuoni",
email=b"448627663@qq.com",
)
+ package_json_no_authors = json.loads(
+ """{
+ "authors": null,
+ "license": "MIT"
+ }"""
+ )
+
+ assert extract_npm_package_author(package_json_no_authors) == Person(
+ fullname=b"", name=None, email=None
+ )
+
def normalize_hashes(hashes):
if isinstance(hashes, str):
return hash_to_bytes(hashes)
if isinstance(hashes, list):
return [hash_to_bytes(x) for x in hashes]
return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()}
_expected_new_contents_first_visit = normalize_hashes(
[
"4ce3058e16ab3d7e077f65aabf855c34895bf17c",
"858c3ceee84c8311adc808f8cdb30d233ddc9d18",
"0fa33b4f5a4e0496da6843a38ff1af8b61541996",
"85a410f8ef8eb8920f2c384a9555566ad4a2e21b",
"9163ac8025923d5a45aaac482262893955c9b37b",
"692cf623b8dd2c5df2c2998fd95ae4ec99882fb4",
"18c03aac6d3e910efb20039c15d70ab5e0297101",
"41265c42446aac17ca769e67d1704f99e5a1394d",
"783ff33f5882813dca9239452c4a7cadd4dba778",
"b029cfb85107aee4590c2434a3329bfcf36f8fa1",
"112d1900b4c2e3e9351050d1b542c9744f9793f3",
"5439bbc4bd9a996f1a38244e6892b71850bc98fd",
"d83097a2f994b503185adf4e719d154123150159",
"d0939b4898e83090ee55fd9d8a60e312cfadfbaf",
"b3523a26f7147e4af40d9d462adaae6d49eda13e",
"cd065fb435d6fb204a8871bcd623d0d0e673088c",
"2854a40855ad839a54f4b08f5cff0cf52fca4399",
"b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe",
"0f73d56e1cf480bded8a1ecf20ec6fc53c574713",
"0d9882b2dfafdce31f4e77fe307d41a44a74cefe",
"585fc5caab9ead178a327d3660d35851db713df1",
"e8cd41a48d79101977e3036a87aeb1aac730686f",
"5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7",
"9c3cc2763bf9e9e37067d3607302c4776502df98",
"3649a68410e354c83cd4a38b66bd314de4c8f5c9",
"e96ed0c091de1ebdf587104eaf63400d1974a1fe",
"078ca03d2f99e4e6eab16f7b75fbb7afb699c86c",
"38de737da99514de6559ff163c988198bc91367a",
]
)
_expected_new_directories_first_visit = normalize_hashes(
[
"3370d20d6f96dc1c9e50f083e2134881db110f4f",
"42753c0c2ab00c4501b552ac4671c68f3cf5aece",
"d7895533ef5edbcffdea3f057d9fef3a1ef845ce",
"80579be563e2ef3e385226fe7a3f079b377f142c",
"3b0ddc6a9e58b4b53c222da4e27b280b6cda591c",
"bcad03ce58ac136f26f000990fc9064e559fe1c0",
"5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca",
"e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd",
"584b5b4b6cf7f038095e820b99386a9c232de931",
"184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a",
"bb5f4ee143c970367eb409f2e4c1104898048b9d",
"1b95491047add1103db0dfdfa84a9735dcb11e88",
"a00c6de13471a2d66e64aca140ddb21ef5521e62",
"5ce6c1cd5cda2d546db513aaad8c72a44c7771e2",
"c337091e349b6ac10d38a49cdf8c2401ef9bb0f2",
"202fafcd7c0f8230e89d5496ad7f44ab12b807bf",
"775cc516543be86c15c1dc172f49c0d4e6e78235",
"ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e",
]
)
_expected_new_revisions_first_visit = normalize_hashes(
{
"d8a1c7474d2956ac598a19f0f27d52f7015f117e": (
"42753c0c2ab00c4501b552ac4671c68f3cf5aece"
),
"5f9eb78af37ffd12949f235e86fac04898f9f72a": (
"3370d20d6f96dc1c9e50f083e2134881db110f4f"
),
"ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a": (
"d7895533ef5edbcffdea3f057d9fef3a1ef845ce"
),
}
)
def package_url(package):
return "https://www.npmjs.com/package/%s" % package
def package_metadata_url(package):
return "https://replicate.npmjs.com/%s/" % package
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
package = "org"
loader = NpmLoader(package_url(package))
actual_load_status = loader.load()
assert actual_load_status["status"] == "eventful"
assert actual_load_status["snapshot_id"] is not None
expected_revision_id = hash_to_bytes("d8a1c7474d2956ac598a19f0f27d52f7015f117e")
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(
revision["metadata"],
paths=[
("intrinsic.tool", str),
("intrinsic.raw", dict),
("extrinsic.provider", str),
("extrinsic.when", str),
("extrinsic.raw", dict),
("original_artifact", list),
],
)
for original_artifact in revision["metadata"]["original_artifact"]:
check_metadata_paths(
original_artifact,
paths=[("filename", str), ("length", int), ("checksums", dict),],
)
def test_npm_loader_first_visit(swh_config, requests_mock_datadir):
package = "org"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
expected_snapshot_id = hash_to_bytes("d0587e1195aed5a8800411a008f2f2d627f18e2d")
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id.hex(),
}
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot_id
)
stats = get_stats(loader.storage)
assert {
"content": len(_expected_new_contents_first_visit),
"directory": len(_expected_new_directories_first_visit),
"origin": 1,
"origin_visit": 1,
"person": 2,
"release": 0,
"revision": len(_expected_new_revisions_first_visit),
"skipped_content": 0,
"snapshot": 1,
} == stats
assert len(
list(loader.storage.content_get(_expected_new_contents_first_visit))
) == len(_expected_new_contents_first_visit)
assert (
list(loader.storage.directory_missing(_expected_new_directories_first_visit))
== []
)
assert (
list(loader.storage.revision_missing(_expected_new_revisions_first_visit)) == []
)
expected_snapshot = Snapshot(
id=expected_snapshot_id,
branches={
b"HEAD": SnapshotBranch(
target=b"releases/0.0.4", target_type=TargetType.ALIAS
),
b"releases/0.0.2": SnapshotBranch(
target=hash_to_bytes("d8a1c7474d2956ac598a19f0f27d52f7015f117e"),
target_type=TargetType.REVISION,
),
b"releases/0.0.3": SnapshotBranch(
target=hash_to_bytes("5f9eb78af37ffd12949f235e86fac04898f9f72a"),
target_type=TargetType.REVISION,
),
b"releases/0.0.4": SnapshotBranch(
target=hash_to_bytes("ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a"),
target_type=TargetType.REVISION,
),
},
)
check_snapshot(expected_snapshot, loader.storage)
def test_npm_loader_incremental_visit(swh_config, requests_mock_datadir_visits):
package = "org"
url = package_url(package)
loader = NpmLoader(url)
expected_snapshot_id = hash_to_bytes("d0587e1195aed5a8800411a008f2f2d627f18e2d")
actual_load_status = loader.load()
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id.hex(),
}
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot_id
)
stats = get_stats(loader.storage)
assert {
"content": len(_expected_new_contents_first_visit),
"directory": len(_expected_new_directories_first_visit),
"origin": 1,
"origin_visit": 1,
"person": 2,
"release": 0,
"revision": len(_expected_new_revisions_first_visit),
"skipped_content": 0,
"snapshot": 1,
} == stats
loader._info = None # reset loader internal state
actual_load_status2 = loader.load()
assert actual_load_status2["status"] == "eventful"
snap_id2 = actual_load_status2["snapshot_id"]
assert snap_id2 is not None
assert snap_id2 != actual_load_status["snapshot_id"]
assert_last_visit_matches(loader.storage, url, status="full", type="npm")
stats = get_stats(loader.storage)
assert { # 3 new releases artifacts
"content": len(_expected_new_contents_first_visit) + 14,
"directory": len(_expected_new_directories_first_visit) + 15,
"origin": 1,
"origin_visit": 2,
"person": 2,
"release": 0,
"revision": len(_expected_new_revisions_first_visit) + 3,
"skipped_content": 0,
"snapshot": 2,
} == stats
urls = [
m.url
for m in requests_mock_datadir_visits.request_history
if m.url.startswith("https://registry.npmjs.org")
]
assert len(urls) == len(set(urls))  # we visited each artifact once across the 2 visits
@pytest.mark.usefixtures("requests_mock_datadir")
def test_npm_loader_version_divergence(swh_config):
package = "@aller_shared"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
expected_snapshot_id = hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92")
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id.hex(),
}
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot_id
)
stats = get_stats(loader.storage)
assert { # 1 new releases artifacts
"content": 534,
"directory": 153,
"origin": 1,
"origin_visit": 1,
"person": 1,
"release": 0,
"revision": 2,
"skipped_content": 0,
"snapshot": 1,
} == stats
expected_snapshot = Snapshot(
id=expected_snapshot_id,
branches={
b"HEAD": SnapshotBranch(
target_type=TargetType.ALIAS, target=b"releases/0.1.0"
),
b"releases/0.1.0": SnapshotBranch(
target_type=TargetType.REVISION,
target=hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"),
),
b"releases/0.1.1-alpha.14": SnapshotBranch(
target_type=TargetType.REVISION,
target=hash_to_bytes("05181c12cd8c22035dd31155656826b85745da37"),
),
},
)
check_snapshot(expected_snapshot, loader.storage)
def test_npm_artifact_to_revision_id_none():
"""Current loader version should stop soon if nothing can be found
"""
artifact_metadata = {
"dist": {"shasum": "05181c12cd8c22035dd31155656826b85745da37",},
}
known_artifacts = {
"b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92": {},
}
assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None
def test_npm_artifact_to_revision_id_old_loader_version():
"""Current loader version should solve old metadata scheme
"""
artifact_metadata = {
"dist": {"shasum": "05181c12cd8c22035dd31155656826b85745da37",}
}
known_artifacts = {
hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): {
"package_source": {"sha1": "something-wrong"}
},
hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): {
"package_source": {"sha1": "05181c12cd8c22035dd31155656826b85745da37",}
},
}
assert artifact_to_revision_id(known_artifacts, artifact_metadata) == hash_to_bytes(
"845673bfe8cbd31b1eaf757745a964137e6f9116"
)
def test_npm_artifact_to_revision_id_current_loader_version():
"""Current loader version should be able to solve current metadata scheme
"""
artifact_metadata = {
"dist": {"shasum": "05181c12cd8c22035dd31155656826b85745da37",}
}
known_artifacts = {
hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): {
"original_artifact": [
{"checksums": {"sha1": "05181c12cd8c22035dd31155656826b85745da37"},}
],
},
hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): {
"original_artifact": [{"checksums": {"sha1": "something-wrong"},}],
},
}
assert artifact_to_revision_id(known_artifacts, artifact_metadata) == hash_to_bytes(
"b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"
)
def test_npm_artifact_with_no_intrinsic_metadata(swh_config, requests_mock_datadir):
"""Skip artifact with no intrinsic metadata during ingestion
"""
package = "nativescript-telerik-analytics"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
# no branch as one artifact without any intrinsic metadata
expected_snapshot = Snapshot(
id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={},
)
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot.id.hex(),
}
check_snapshot(expected_snapshot, loader.storage)
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot.id
)
def test_npm_artifact_with_no_upload_time(swh_config, requests_mock_datadir):
"""With no time upload, artifact is skipped
"""
package = "jammit-no-time"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
# no branch as one artifact without any intrinsic metadata
expected_snapshot = Snapshot(
id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={},
)
assert actual_load_status == {
"status": "uneventful",
"snapshot_id": expected_snapshot.id.hex(),
}
check_snapshot(expected_snapshot, loader.storage)
assert_last_visit_matches(
loader.storage, url, status="partial", type="npm", snapshot=expected_snapshot.id
)
def test_npm_artifact_use_mtime_if_no_time(swh_config, requests_mock_datadir):
"""With no time upload, artifact is skipped
"""
package = "jammit-express"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
expected_snapshot_id = hash_to_bytes("d6e08e19159f77983242877c373c75222d5ae9dd")
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id.hex(),
}
# artifact is used
expected_snapshot = Snapshot(
id=expected_snapshot_id,
branches={
b"HEAD": SnapshotBranch(
target_type=TargetType.ALIAS, target=b"releases/0.0.1"
),
b"releases/0.0.1": SnapshotBranch(
target_type=TargetType.REVISION,
target=hash_to_bytes("9e4dd2b40d1b46b70917c0949aa2195c823a648e"),
),
},
)
check_snapshot(expected_snapshot, loader.storage)
assert_last_visit_matches(
loader.storage, url, status="full", type="npm", snapshot=expected_snapshot.id
)
def test_npm_no_artifact(swh_config, requests_mock_datadir):
"""If no artifacts at all is found for origin, the visit fails completely
"""
package = "catify"
url = package_url(package)
loader = NpmLoader(url)
actual_load_status = loader.load()
assert actual_load_status == {
"status": "failed",
}
assert_last_visit_matches(loader.storage, url, status="partial", type="npm")