diff --git a/PKG-INFO b/PKG-INFO index 208999e..a92a4b5 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,56 +1,56 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 2.6.0 +Version: 2.6.1 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core is a low-level loading utilities and helpers used by :term:`loaders `. The main entry points are classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations. This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found diff --git a/pytest.ini b/pytest.ini index 276cddc..478264b 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] -norecursedirs = docs .* +norecursedirs = build docs .* markers = db: marks tests as using a db (deselect with '-m "not db"') fs: marks tests as using the filesystem (deselect with '-m "not fs"') diff --git a/swh.loader.core.egg-info/PKG-INFO b/swh.loader.core.egg-info/PKG-INFO index 208999e..a92a4b5 100644 --- a/swh.loader.core.egg-info/PKG-INFO +++ b/swh.loader.core.egg-info/PKG-INFO @@ -1,56 +1,56 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 2.6.0 +Version: 2.6.1 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core is a low-level loading utilities and helpers used by :term:`loaders `. The main entry points are classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations. This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found diff --git a/swh.loader.core.egg-info/SOURCES.txt b/swh.loader.core.egg-info/SOURCES.txt index dbf32c7..7a9d27f 100644 --- a/swh.loader.core.egg-info/SOURCES.txt +++ b/swh.loader.core.egg-info/SOURCES.txt @@ -1,216 +1,218 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.rst conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/README.rst docs/cli.rst docs/conf.py docs/index.rst docs/package-loader-specifications.rst docs/package-loader-tutorial.rst docs/vcs-loader-overview.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.loader.core.egg-info/PKG-INFO swh.loader.core.egg-info/SOURCES.txt swh.loader.core.egg-info/dependency_links.txt swh.loader.core.egg-info/entry_points.txt swh.loader.core.egg-info/requires.txt swh.loader.core.egg-info/top_level.txt swh/loader/__init__.py swh/loader/cli.py swh/loader/exception.py swh/loader/pytest_plugin.py swh/loader/core/__init__.py swh/loader/core/converters.py swh/loader/core/loader.py swh/loader/core/py.typed swh/loader/core/utils.py swh/loader/core/tests/__init__.py swh/loader/core/tests/test_converters.py swh/loader/core/tests/test_loader.py swh/loader/core/tests/test_utils.py swh/loader/package/__init__.py swh/loader/package/loader.py swh/loader/package/py.typed swh/loader/package/utils.py swh/loader/package/archive/__init__.py swh/loader/package/archive/loader.py swh/loader/package/archive/tasks.py swh/loader/package/archive/tests/__init__.py swh/loader/package/archive/tests/test_archive.py swh/loader/package/archive/tests/test_tasks.py swh/loader/package/archive/tests/data/not_gzipped_tarball.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz swh/loader/package/cran/__init__.py swh/loader/package/cran/loader.py swh/loader/package/cran/tasks.py swh/loader/package/cran/tests/__init__.py swh/loader/package/cran/tests/test_cran.py swh/loader/package/cran/tests/test_tasks.py swh/loader/package/cran/tests/data/description/KnownBR swh/loader/package/cran/tests/data/description/acepack swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz swh/loader/package/debian/__init__.py swh/loader/package/debian/loader.py swh/loader/package/debian/tasks.py swh/loader/package/debian/tests/__init__.py swh/loader/package/debian/tests/test_debian.py swh/loader/package/debian/tests/test_tasks.py swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz swh/loader/package/debian/tests/data/http_deb.debian.org/onefile.txt swh/loader/package/deposit/__init__.py swh/loader/package/deposit/loader.py swh/loader/package/deposit/tasks.py swh/loader/package/deposit/tests/__init__.py swh/loader/package/deposit/tests/conftest.py swh/loader/package/deposit/tests/test_deposit.py swh/loader/package/deposit/tests/test_tasks.py swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.10.zip swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.12.tar.gz swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.12.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.13.json swh/loader/package/maven/__init__.py swh/loader/package/maven/loader.py swh/loader/package/maven/tasks.py swh/loader/package/maven/tests/__init__.py swh/loader/package/maven/tests/test_maven.py swh/loader/package/maven/tests/test_tasks.py swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom swh/loader/package/nixguix/__init__.py swh/loader/package/nixguix/loader.py swh/loader/package/nixguix/tasks.py swh/loader/package/nixguix/tests/__init__.py swh/loader/package/nixguix/tests/conftest.py swh/loader/package/nixguix/tests/test_nixguix.py swh/loader/package/nixguix/tests/test_tasks.py swh/loader/package/nixguix/tests/data/https_example.com/file.txt swh/loader/package/nixguix/tests/data/https_fail.com/truncated-archive.tgz swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz swh/loader/package/nixguix/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-3_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources-EOFError.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 swh/loader/package/npm/__init__.py swh/loader/package/npm/loader.py swh/loader/package/npm/tasks.py swh/loader/package/npm/tests/__init__.py swh/loader/package/npm/tests/test_npm.py swh/loader/package/npm/tests/test_tasks.py swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.1-alpha.14.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/jammit-express_-_jammit-express-0.0.1.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/nativescript-telerik-analytics_-_nativescript-telerik-analytics-1.0.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.2.tgz +swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.4.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.5.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.2.0.tgz swh/loader/package/npm/tests/data/https_replicate.npmjs.com/@aller_shared swh/loader/package/npm/tests/data/https_replicate.npmjs.com/catify swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-express swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-no-time swh/loader/package/npm/tests/data/https_replicate.npmjs.com/nativescript-telerik-analytics swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org +swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_visit1 swh/loader/package/opam/__init__.py swh/loader/package/opam/loader.py swh/loader/package/opam/tasks.py swh/loader/package/opam/tests/__init__.py swh/loader/package/opam/tests/test_opam.py swh/loader/package/opam/tests/test_tasks.py swh/loader/package/opam/tests/data/fake_opam_repo/_repo swh/loader/package/opam/tests/data/fake_opam_repo/version swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/lock swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/repos-config swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/agrid/agrid.0.1/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.1/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.2/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.3/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/ocb/ocb.0.1/opam swh/loader/package/opam/tests/data/https_github.com/OCamlPro_agrid_archive_0.1.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.1.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.2.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.3.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_ocb_archive_0.1.tar.gz swh/loader/package/pypi/__init__.py swh/loader/package/pypi/loader.py swh/loader/package/pypi/tasks.py swh/loader/package/pypi/tests/__init__.py swh/loader/package/pypi/tests/test_pypi.py swh/loader/package/pypi/tests/test_tasks.py swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_86_10_c9555ec63106153aaaad753a281ff47f4ac79e980ff7f5d740d6649cd56a_upymenu-0.0.1.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_upymenu_json swh/loader/package/tests/__init__.py swh/loader/package/tests/common.py swh/loader/package/tests/test_conftest.py swh/loader/package/tests/test_loader.py swh/loader/package/tests/test_loader_metadata.py swh/loader/package/tests/test_utils.py swh/loader/tests/__init__.py swh/loader/tests/conftest.py swh/loader/tests/py.typed swh/loader/tests/test_cli.py swh/loader/tests/test_init.py swh/loader/tests/data/0805nexter-1.1.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py index 964b880..794f33d 100644 --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -1,381 +1,381 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from datetime import timezone import json import logging from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union import attr import requests from swh.core.config import load_from_envvar from swh.loader.core.loader import DEFAULT_CONFIG from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import cached_method, download from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, ObjectType, Person, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) def now() -> datetime.datetime: return datetime.datetime.now(tz=timezone.utc) @attr.s class DepositPackageInfo(BasePackageInfo): filename = attr.ib(type=str) # instead of Optional[str] author_date = attr.ib(type=datetime.datetime) """codemeta:dateCreated if any, deposit completed_date otherwise""" commit_date = attr.ib(type=datetime.datetime) """codemeta:datePublished if any, deposit completed_date otherwise""" client = attr.ib(type=str) id = attr.ib(type=int) """Internal ID of the deposit in the deposit DB""" collection = attr.ib(type=str) """The collection in the deposit; see SWORD specification.""" author = attr.ib(type=Person) committer = attr.ib(type=Person) release_notes = attr.ib(type=Optional[str]) @classmethod def from_metadata( cls, metadata: Dict[str, Any], url: str, filename: str, version: str ) -> "DepositPackageInfo": # Note: # `date` and `committer_date` are always transmitted by the deposit read api # which computes itself the values. The loader needs to use those to create the # release. - metadata_raw: str = metadata["metadata_raw"] + raw_metadata: str = metadata["raw_metadata"] depo = metadata["deposit"] return cls( url=url, filename=filename, version=version, author_date=depo["author_date"], commit_date=depo["committer_date"], client=depo["client"], id=depo["id"], collection=depo["collection"], author=parse_author(depo["author"]), committer=parse_author(depo["committer"]), release_notes=depo["release_notes"], directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( discovery_date=now(), - metadata=metadata_raw.encode(), + metadata=raw_metadata.encode(), format="sword-v2-atom-codemeta-v2", ) ], ) def extid(self) -> None: # For now, we don't try to deduplicate deposits. There is little point anyway, # as it only happens when the exact same tarball was deposited twice. return None class DepositLoader(PackageLoader[DepositPackageInfo]): """Load a deposited artifact into swh archive. """ visit_type = "deposit" def __init__( self, storage: StorageInterface, url: str, deposit_id: str, deposit_client: "ApiClient", max_content_size: Optional[int] = None, default_filename: str = "archive.tar", ): """Constructor Args: url: Origin url to associate the artifacts/metadata to deposit_id: Deposit identity deposit_client: Deposit api client """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.deposit_id = deposit_id self.client = deposit_client self.default_filename = default_filename @classmethod def from_configfile(cls, **kwargs: Any): """Instantiate a loader from the configuration loaded from the SWH_CONFIG_FILENAME envvar, with potential extra keyword arguments if their value is not None. Args: kwargs: kwargs passed to the loader instantiation """ config = dict(load_from_envvar(DEFAULT_CONFIG)) config.update({k: v for k, v in kwargs.items() if v is not None}) deposit_client = ApiClient(**config.pop("deposit")) return cls.from_config(deposit_client=deposit_client, **config) def get_versions(self) -> Sequence[str]: # only 1 branch 'HEAD' with no alias since we only have 1 snapshot # branch return ["HEAD"] def get_metadata_authority(self) -> MetadataAuthority: provider = self.metadata()["provider"] assert provider["provider_type"] == MetadataAuthorityType.DEPOSIT_CLIENT.value return MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=provider["provider_url"], metadata={ "name": provider["provider_name"], **(provider["metadata"] or {}), }, ) def get_metadata_fetcher(self) -> MetadataFetcher: tool = self.metadata()["tool"] return MetadataFetcher( name=tool["name"], version=tool["version"], metadata=tool["configuration"], ) def get_package_info( self, version: str ) -> Iterator[Tuple[str, DepositPackageInfo]]: p_info = DepositPackageInfo.from_metadata( self.metadata(), url=self.url, filename=self.default_filename, version=version, ) yield "HEAD", p_info def download_package( self, p_info: DepositPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Override to allow use of the dedicated deposit client """ return [self.client.archive_get(self.deposit_id, tmpdir, p_info.filename)] def build_release( self, p_info: DepositPackageInfo, uncompressed_path: str, directory: Sha1Git, ) -> Optional[Release]: message = ( f"{p_info.client}: Deposit {p_info.id} in collection {p_info.collection}" ) if p_info.release_notes: message += "\n\n" + p_info.release_notes if not message.endswith("\n"): message += "\n" return Release( name=p_info.version.encode(), message=message.encode(), author=p_info.author, date=TimestampWithTimezone.from_dict(p_info.author_date), target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: metadata = self.metadata() - metadata_raw: str = metadata["metadata_raw"] + raw_metadata: str = metadata["raw_metadata"] origin_metadata = json.dumps( { - "metadata": [metadata_raw], + "metadata": [raw_metadata], "provider": metadata["provider"], "tool": metadata["tool"], } ).encode() return [ RawExtrinsicMetadataCore( discovery_date=now(), - metadata=metadata_raw.encode(), + metadata=raw_metadata.encode(), format="sword-v2-atom-codemeta-v2", ), RawExtrinsicMetadataCore( discovery_date=now(), metadata=origin_metadata, format="original-artifacts-json", ), ] @cached_method def metadata(self): """Returns metadata from the deposit server""" return self.client.metadata_get(self.deposit_id) def load(self) -> Dict: # First making sure the deposit is known on the deposit's RPC server # prior to trigger a loading try: self.metadata() except ValueError: logger.error(f"Unknown deposit {self.deposit_id}, ignoring") return {"status": "failed"} # Then usual loading return super().load() def finalize_visit( self, status_visit: str, errors: Optional[List[str]] = None, **kwargs ) -> Dict[str, Any]: r = super().finalize_visit(status_visit=status_visit, **kwargs) success = status_visit == "full" # Update deposit status try: if not success: self.client.status_update( self.deposit_id, status="failed", errors=errors, ) return r snapshot_id = hash_to_bytes(r["snapshot_id"]) snapshot = snapshot_get_all_branches(self.storage, snapshot_id) if not snapshot: return r branches = snapshot.branches logger.debug("branches: %s", branches) if not branches: return r rel_id = branches[b"HEAD"].target release = self.storage.release_get([rel_id])[0] if not release: return r # update the deposit's status to success with its # release-id and directory-id self.client.status_update( self.deposit_id, status="done", release_id=hash_to_hex(rel_id), directory_id=hash_to_hex(release.target), snapshot_id=r["snapshot_id"], origin_url=self.url, ) except Exception: logger.exception("Problem when trying to update the deposit's status") return {"status": "failed"} return r def parse_author(author) -> Person: """See prior fixme """ return Person( fullname=author["fullname"].encode("utf-8"), name=author["name"].encode("utf-8"), email=author["email"].encode("utf-8"), ) class ApiClient: """Private Deposit Api client """ def __init__(self, url, auth: Optional[Mapping[str, str]]): self.base_url = url.rstrip("/") self.auth = None if not auth else (auth["username"], auth["password"]) def do(self, method: str, url: str, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. Args: method (str): supported http methods as in get/post/put Returns: The request's execution output """ method_fn = getattr(requests, method) if self.auth: kwargs["auth"] = self.auth return method_fn(url, *args, **kwargs) def archive_get( self, deposit_id: Union[int, str], tmpdir: str, filename: str ) -> Tuple[str, Dict]: """Retrieve deposit's archive artifact locally """ url = f"{self.base_url}/{deposit_id}/raw/" return download(url, dest=tmpdir, filename=filename, auth=self.auth) def metadata_url(self, deposit_id: Union[int, str]) -> str: return f"{self.base_url}/{deposit_id}/meta/" def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]: """Retrieve deposit's metadata artifact as json """ url = self.metadata_url(deposit_id) r = self.do("get", url) if r.ok: return r.json() msg = f"Problem when retrieving deposit metadata at {url}" logger.error(msg) raise ValueError(msg) def status_update( self, deposit_id: Union[int, str], status: str, errors: Optional[List[str]] = None, release_id: Optional[str] = None, directory_id: Optional[str] = None, snapshot_id: Optional[str] = None, origin_url: Optional[str] = None, ): """Update deposit's information including status, and persistent identifiers result of the loading. """ url = f"{self.base_url}/{deposit_id}/update/" payload: Dict[str, Any] = {"status": status} if release_id: payload["release_id"] = release_id if directory_id: payload["directory_id"] = directory_id if snapshot_id: payload["snapshot_id"] = snapshot_id if origin_url: payload["origin_url"] = origin_url if errors: payload["status_detail"] = {"loading": errors} self.do("put", url, json=payload) diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta index 758fdd2..b76dc9e 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta @@ -1,51 +1,51 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/some-external-id", "type": "deposit" }, - "metadata_raw" : "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother one", + "raw_metadata" : "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother one", "provider": { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": null }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": { "sword_version": "2" } }, "deposit": { "id": "666", "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "revision_parents": [], "release_notes": null } } diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_meta b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_meta index 8b46bcd..1a2c258 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_meta +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_meta @@ -1,51 +1,51 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/some-external-id", "type": "deposit" }, - "metadata_raw": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", + "raw_metadata": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", "provider": { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": null }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": { "sword_version": "2" } }, "deposit": { "id": 777, "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507474800, "microseconds": 0 }, "offset": 0 }, "revision_parents": [], "release_notes": null } } diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_meta b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_meta index 30cc188..fbf3272 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_meta +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_meta @@ -1,51 +1,51 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/hal-123456", "type": "deposit" }, - "metadata_raw": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", + "raw_metadata": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", "provider": { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": null }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": { "sword_version": "2" } }, "deposit": { "id": 888, "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507474800, "microseconds": 0 }, "offset": 0 }, "revision_parents": [], "release_notes": null } } diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_meta b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_meta index bad1d1d..62de3de 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_meta +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_meta @@ -1,51 +1,51 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/hal-123456", "type": "deposit" }, - "metadata_raw": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", + "raw_metadata": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", "provider": { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": null }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": { "sword_version": "2" } }, "deposit": { "id": 999, "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507474800, "microseconds": 0 }, "offset": 0 }, "revision_parents": [], "release_notes": "This release adds this and that." } } diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json index 758fdd2..b76dc9e 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json @@ -1,51 +1,51 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/some-external-id", "type": "deposit" }, - "metadata_raw" : "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother one", + "raw_metadata" : "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother one", "provider": { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": null }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": { "sword_version": "2" } }, "deposit": { "id": "666", "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "revision_parents": [], "release_notes": null } } diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json index 8b46bcd..1a2c258 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json @@ -1,51 +1,51 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/some-external-id", "type": "deposit" }, - "metadata_raw": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", + "raw_metadata": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", "provider": { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": null }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": { "sword_version": "2" } }, "deposit": { "id": 777, "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507474800, "microseconds": 0 }, "offset": 0 }, "revision_parents": [], "release_notes": null } } diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.12.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.12.json index 30cc188..fbf3272 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.12.json +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.12.json @@ -1,51 +1,51 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/hal-123456", "type": "deposit" }, - "metadata_raw": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", + "raw_metadata": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", "provider": { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": null }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": { "sword_version": "2" } }, "deposit": { "id": 888, "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507474800, "microseconds": 0 }, "offset": 0 }, "revision_parents": [], "release_notes": null } } diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.13.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.13.json index bad1d1d..62de3de 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.13.json +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.13.json @@ -1,51 +1,51 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/hal-123456", "type": "deposit" }, - "metadata_raw": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", + "raw_metadata": "some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", "provider": { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": null }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": { "sword_version": "2" } }, "deposit": { "id": 999, "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0 }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507474800, "microseconds": 0 }, "offset": 0 }, "revision_parents": [], "release_notes": "This release adds this and that." } } diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py index 7ee0e4b..64476a4 100644 --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -1,557 +1,557 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json import re import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.loader.package.deposit.loader import ApiClient, DepositLoader from swh.loader.package.loader import now from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( Origin, Person, RawExtrinsicMetadata, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType DEPOSIT_URL = "https://deposit.softwareheritage.org/1/private" @pytest.fixture def requests_mock_datadir(requests_mock_datadir): """Enhance default mock data to mock put requests as the loader does some internal update queries there. """ requests_mock_datadir.put(re.compile("https")) return requests_mock_datadir def test_deposit_init_ok(swh_storage, deposit_client, swh_loader_config): url = "some-url" deposit_id = 999 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) # Something that does not exist assert loader.url == url assert loader.client is not None assert loader.client.base_url == swh_loader_config["deposit"]["url"] def test_deposit_from_configfile(swh_config): """Ensure the deposit instantiation is ok """ loader = DepositLoader.from_configfile( url="some-url", deposit_id="666", default_filename="archive.zip" ) assert isinstance(loader.client, ApiClient) def test_deposit_loading_unknown_deposit( swh_storage, deposit_client, requests_mock_datadir ): """Loading an unknown deposit should fail no origin, no visit, no snapshot """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url" unknown_deposit_id = 667 loader = DepositLoader( swh_storage, url, unknown_deposit_id, deposit_client, default_filename="archive.zip", ) # does not exist actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 0, "origin_visit": 0, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 0, } == stats requests_mock_datadir_missing_one = requests_mock_datadir_factory( ignore_urls=[f"{DEPOSIT_URL}/666/raw/",] ) def test_deposit_loading_failure_to_retrieve_1_artifact( swh_storage, deposit_client, requests_mock_datadir_missing_one ): """Deposit with missing artifact ends up with an uneventful/partial visit """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url-2" deposit_id = 666 requests_mock_datadir_missing_one.put(re.compile("https")) loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(loader.storage, url, status="partial", type="deposit") stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir_missing_one.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "failed", "status_detail": { "loading": [ "Failed to load branch HEAD for some-url-2: Fail to query " "'https://deposit.softwareheritage.org/1/private/666/raw/'. Reason: 404" ] }, } assert body == expected_body def test_deposit_loading_ok(swh_storage, deposit_client, requests_mock_datadir): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() expected_snapshot_id = "338b45d87e02fb5cbf324694bc4a898623d6a30f" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) release_id_hex = "2566a64a27bc00362e265be9666d7606750530a1" release_id = hash_to_bytes(release_id_hex) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch(target=release_id, target_type=TargetType.RELEASE,), }, ) check_snapshot(expected_snapshot, storage=loader.storage) release = loader.storage.release_get([release_id])[0] date = TimestampWithTimezone.from_datetime( datetime.datetime(2017, 10, 7, 15, 17, 8, tzinfo=datetime.timezone.utc) ) person = Person( fullname=b"Software Heritage", name=b"Software Heritage", email=b"robot@softwareheritage.org", ) assert release == Release( id=release_id, name=b"HEAD", message=b"hal: Deposit 666 in collection hal\n", author=person, date=date, target_type=ModelObjectType.DIRECTORY, target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19", synthetic=True, metadata=None, ) # check metadata fetcher = MetadataFetcher(name="swh-deposit", version="0.0.1",) authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://hal-test.archives-ouvertes.fr/", ) # Check origin metadata orig_meta = loader.storage.raw_extrinsic_metadata_get( Origin(url).swhid(), authority ) assert orig_meta.next_page_token is None raw_meta = loader.client.metadata_get(deposit_id) - metadata_raw: str = raw_meta["metadata_raw"] + raw_metadata: str = raw_meta["raw_metadata"] # 2 raw metadata xml + 1 json dict assert len(orig_meta.results) == 2 orig_meta0 = orig_meta.results[0] assert orig_meta0.authority == authority assert orig_meta0.fetcher == fetcher # Check directory metadata assert release.target_type == ModelObjectType.DIRECTORY directory_swhid = CoreSWHID( object_type=ObjectType.DIRECTORY, object_id=release.target ) actual_dir_meta = loader.storage.raw_extrinsic_metadata_get( directory_swhid, authority ) assert actual_dir_meta.next_page_token is None assert len(actual_dir_meta.results) == 1 dir_meta = actual_dir_meta.results[0] assert dir_meta.authority == authority assert dir_meta.fetcher == fetcher - assert dir_meta.metadata.decode() == metadata_raw + assert dir_meta.metadata.decode() == raw_metadata # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "done", "release_id": release_id_hex, "directory_id": hash_to_hex(release.target), "snapshot_id": expected_snapshot_id, "origin_url": url, } assert body == expected_body stats = get_stats(loader.storage) assert { "content": 303, "directory": 12, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_deposit_loading_ok_2(swh_storage, deposit_client, requests_mock_datadir): """Field dates should be se appropriately """ external_id = "some-external-id" url = f"https://hal-test.archives-ouvertes.fr/{external_id}" deposit_id = 777 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() expected_snapshot_id = "3449b8ff31abeacefd33cca60e3074c1649dc3a1" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) release_id = "ba6c9a59ae3256e765d32b211cc183dc2380aed7" expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes(release_id), target_type=TargetType.RELEASE ) }, ) check_snapshot(expected_snapshot, storage=loader.storage) raw_meta = loader.client.metadata_get(deposit_id) # Ensure the date fields are set appropriately in the release # Retrieve the release release = loader.storage.release_get([hash_to_bytes(release_id)])[0] assert release # swh-deposit uses the numeric 'offset_minutes' instead of the bytes offset # attribute, because its dates are always well-formed, and it can only send # JSON-serializable data. release_date_dict = { "timestamp": release.date.timestamp.to_dict(), "offset": release.date.offset_minutes(), } assert release_date_dict == raw_meta["deposit"]["author_date"] assert not release.metadata provider = { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": None, } tool = { "name": "swh-deposit", "version": "0.0.1", "configuration": {"sword_version": "2"}, } fetcher = MetadataFetcher(name="swh-deposit", version="0.0.1",) authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://hal-test.archives-ouvertes.fr/", ) # Check the origin metadata swh side origin_extrinsic_metadata = loader.storage.raw_extrinsic_metadata_get( Origin(url).swhid(), authority ) assert origin_extrinsic_metadata.next_page_token is None - metadata_raw: str = raw_meta["metadata_raw"] + raw_metadata: str = raw_meta["raw_metadata"] # 1 raw metadata xml + 1 json dict assert len(origin_extrinsic_metadata.results) == 2 origin_swhid = Origin(url).swhid() expected_metadata = [] origin_meta = origin_extrinsic_metadata.results[0] expected_metadata.append( RawExtrinsicMetadata( target=origin_swhid, discovery_date=origin_meta.discovery_date, - metadata=metadata_raw.encode(), + metadata=raw_metadata.encode(), format="sword-v2-atom-codemeta-v2", authority=authority, fetcher=fetcher, ) ) origin_metadata = { - "metadata": [metadata_raw], + "metadata": [raw_metadata], "provider": provider, "tool": tool, } expected_metadata.append( RawExtrinsicMetadata( target=origin_swhid, discovery_date=origin_extrinsic_metadata.results[-1].discovery_date, metadata=json.dumps(origin_metadata).encode(), format="original-artifacts-json", authority=authority, fetcher=fetcher, ) ) assert sorted(origin_extrinsic_metadata.results) == sorted(expected_metadata) # Check the release metadata swh side assert release.target_type == ModelObjectType.DIRECTORY directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) actual_directory_metadata = loader.storage.raw_extrinsic_metadata_get( directory_swhid, authority ) assert actual_directory_metadata.next_page_token is None assert len(actual_directory_metadata.results) == 1 release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release_id) ) dir_metadata_template = RawExtrinsicMetadata( target=directory_swhid, format="sword-v2-atom-codemeta-v2", authority=authority, fetcher=fetcher, origin=url, release=release_swhid, # to satisfy the constructor discovery_date=now(), metadata=b"", ) expected_directory_metadata = [] dir_metadata = actual_directory_metadata.results[0] expected_directory_metadata.append( RawExtrinsicMetadata.from_dict( { **{ k: v for (k, v) in dir_metadata_template.to_dict().items() if k != "id" }, "discovery_date": dir_metadata.discovery_date, - "metadata": metadata_raw.encode(), + "metadata": raw_metadata.encode(), } ) ) assert sorted(actual_directory_metadata.results) == sorted( expected_directory_metadata ) # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "done", "release_id": release_id, "directory_id": hash_to_hex(release.target), "snapshot_id": expected_snapshot_id, "origin_url": url, } assert body == expected_body def test_deposit_loading_ok_3(swh_storage, deposit_client, requests_mock_datadir): """Deposit loading can happen on tarball artifacts as well The latest deposit changes introduce the internal change. """ external_id = "hal-123456" url = f"https://hal-test.archives-ouvertes.fr/{external_id}" deposit_id = 888 loader = DepositLoader(swh_storage, url, deposit_id, deposit_client) actual_load_status = loader.load() expected_snapshot_id = "4677843de89e398f1d6bfedc9ca9b89c451c55c8" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) def test_deposit_loading_ok_release_notes( swh_storage, deposit_client, requests_mock_datadir ): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 999 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() expected_snapshot_id = "a307acffb7c29bebb3daf1bcb680bb3f452890a8" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) release_id_hex = "f5e8ec02ede57edbe061afa7fc2a07bb7d14a700" release_id = hash_to_bytes(release_id_hex) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch(target=release_id, target_type=TargetType.RELEASE,), }, ) check_snapshot(expected_snapshot, storage=loader.storage) release = loader.storage.release_get([release_id])[0] date = TimestampWithTimezone.from_datetime( datetime.datetime(2017, 10, 7, 15, 17, 8, tzinfo=datetime.timezone.utc) ) person = Person( fullname=b"Software Heritage", name=b"Software Heritage", email=b"robot@softwareheritage.org", ) assert release == Release( id=release_id, name=b"HEAD", message=( b"hal: Deposit 999 in collection hal\n\nThis release adds this and that.\n" ), author=person, date=date, target_type=ModelObjectType.DIRECTORY, target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19", synthetic=True, metadata=None, ) diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py index a302cbf..b082a0f 100644 --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -1,296 +1,309 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from codecs import BOM_UTF8 import json import logging import os +import string from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from urllib.parse import quote import attr import chardet from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, - PartialExtID, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import api_info, cached_method, release_name -from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EMPTY_PERSON = Person.from_fullname(b"") -EXTID_TYPE = "npm-archive-sha1" -EXTID_VERSION = 0 - - @attr.s class NpmPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) + package_name = attr.ib(type=str) date = attr.ib(type=Optional[str]) shasum = attr.ib(type=str) """sha1 checksum""" + # we cannot rely only on $shasum, as it is technically possible for two versions + # of the same package to have the exact same tarball. + # But the release data (message and date) are extrinsic to the content of the + # package, so they differ between versions. + # So we need every attribute used to build the release object to be part of the + # manifest. + MANIFEST_FORMAT = string.Template( + "date $date\nname $package_name\nshasum $shasum\nurl $url\nversion $version" + ) + EXTID_TYPE = "npm-manifest-sha256" + EXTID_VERSION = 0 + @classmethod def from_metadata( cls, project_metadata: Dict[str, Any], version: str ) -> "NpmPackageInfo": package_metadata = project_metadata["versions"][version] url = package_metadata["dist"]["tarball"] + assert package_metadata["name"] == project_metadata["name"] + # No date available in intrinsic metadata: retrieve it from the API # metadata, using the version number that the API claims this package # has. extrinsic_version = package_metadata["version"] if "time" in project_metadata: date = project_metadata["time"][extrinsic_version] elif "mtime" in package_metadata: date = package_metadata["mtime"] else: date = None return cls( + package_name=package_metadata["name"], url=url, filename=os.path.basename(url), date=date, shasum=package_metadata["dist"]["shasum"], version=extrinsic_version, raw_info=package_metadata, directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="replicate-npm-package-json", metadata=json.dumps(package_metadata).encode(), ) ], ) - def extid(self) -> PartialExtID: - return (EXTID_TYPE, EXTID_VERSION, hash_to_bytes(self.shasum)) - class NpmLoader(PackageLoader[NpmPackageInfo]): """Load npm origin's artifact releases into swh archive. """ visit_type = "npm" def __init__( self, storage: StorageInterface, url: str, max_content_size: Optional[int] = None, ): """Constructor Args str: origin url (e.g. https://www.npmjs.com/package/) """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.package_name = url.split("https://www.npmjs.com/package/")[1] safe_name = quote(self.package_name, safe="") self.provider_url = f"https://replicate.npmjs.com/{safe_name}/" self._info: Dict[str, Any] = {} self._versions = None @cached_method def _raw_info(self) -> bytes: return api_info(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from npm registry) """ return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return sorted(list(self.info()["versions"].keys())) def get_default_version(self) -> str: return self.info()["dist-tags"].get("latest", "") def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, NpmPackageInfo]]: p_info = NpmPackageInfo.from_metadata( project_metadata=self.info(), version=version ) yield release_name(version), p_info def build_release( self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: + # Metadata from NPM is not intrinsic to tarballs. + # This means two package versions can have the same tarball, but different + # metadata. To avoid mixing up releases, every field used to build the + # release object must be part of NpmPackageInfo.MANIFEST_FORMAT. i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None author = extract_npm_package_author(i_metadata) + assert self.package_name == p_info.package_name msg = ( - f"Synthetic release for NPM source package {self.package_name} " + f"Synthetic release for NPM source package {p_info.package_name} " f"version {p_info.version}\n" ) if p_info.date is None: url = p_info.url artifact_name = os.path.basename(url) raise ValueError( "Origin %s: Cannot determine upload time for artifact %s." % (p_info.url, artifact_name) ) date = TimestampWithTimezone.from_iso8601(p_info.date) # FIXME: this is to remain bug-compatible with earlier versions: date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0)) r = Release( name=p_info.version.encode(), message=msg.encode(), author=author, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) return r def _author_str(author_data: Union[Dict, List, str]) -> str: """Parse author from package.json author fields """ if isinstance(author_data, dict): author_str = "" name = author_data.get("name") if name is not None: if isinstance(name, str): author_str += name elif isinstance(name, list): author_str += _author_str(name[0]) if len(name) > 0 else "" email = author_data.get("email") if email is not None: author_str += f" <{email}>" result = author_str elif isinstance(author_data, list): result = _author_str(author_data[0]) if len(author_data) > 0 else "" else: result = author_data return result def extract_npm_package_author(package_json: Dict[str, Any]) -> Person: """ Extract package author from a ``package.json`` file content and return it in swh format. Args: package_json: Dict holding the content of parsed ``package.json`` file Returns: Person """ for author_key in ("author", "authors"): if author_key in package_json: author_data = package_json[author_key] if author_data is None: return EMPTY_PERSON author_str = _author_str(author_data) return Person.from_fullname(author_str.encode()) return EMPTY_PERSON def _lstrip_bom(s, bom=BOM_UTF8): if s.startswith(bom): return s[len(bom) :] else: return s def load_json(json_bytes): """ Try to load JSON from bytes and return a dictionary. First try to decode from utf-8. If the decoding failed, try to detect the encoding and decode again with replace error handling. If JSON is malformed, an empty dictionary will be returned. Args: json_bytes (bytes): binary content of a JSON file Returns: dict: JSON data loaded in a dictionary """ json_data = {} try: json_str = _lstrip_bom(json_bytes).decode("utf-8") except UnicodeDecodeError: encoding = chardet.detect(json_bytes)["encoding"] if encoding: json_str = json_bytes.decode(encoding, "replace") try: json_data = json.loads(json_str) except json.decoder.JSONDecodeError: pass return json_data def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from npm. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) == 0: return {} project_dirname = lst[0] package_json_path = os.path.join(dir_path, project_dirname, "package.json") if not os.path.exists(package_json_path): return {} with open(package_json_path, "rb") as package_json_file: package_json_bytes = package_json_file.read() return load_json(package_json_bytes) diff --git a/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz b/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz new file mode 100644 index 0000000..bc20daa Binary files /dev/null and b/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz differ diff --git a/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch b/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch new file mode 100644 index 0000000..fc08add --- /dev/null +++ b/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch @@ -0,0 +1,141 @@ +{ + "_id": "org_version_mismatch", + "_rev": "4-22484cc537f12d3023241211ee34e39d", + "name": "org_version_mismatch", + "description": "A parser and converter for org-mode notation", + "dist-tags": { + "latest": "0.0.3" + }, + "versions": { + "0.0.3-beta": { + "name": "org_version_mismatch", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "main": "./lib/org.js", + "version": "0.0.3-beta", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "bugs": { + "url": "https://github.com/mooz/org-js/issues" + }, + "_id": "org@0.0.3-beta", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3-beta.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.3": { + "name": "org_version_mismatch", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.3", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.3", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + } + }, + "readme": "org-js\n======\n\nParser and converter for org-mode () notation written in JavaScript.\n\nInteractive Editor\n------------------\n\nFor working example, see http://mooz.github.com/org-js/editor/.\n\nInstallation\n------------\n\n npm install org\n\nSimple example of org -> HTML conversion\n----------------------------------------\n\n```javascript\nvar org = require(\"org\");\n\nvar parser = new org.Parser();\nvar orgDocument = parser.parse(orgCode);\nvar orgHTMLDocument = orgDocument.convert(org.ConverterHTML, {\n headerOffset: 1,\n exportFromLineNumber: false,\n suppressSubScriptHandling: false,\n suppressAutoLink: false\n});\n\nconsole.dir(orgHTMLDocument); // => { title, contentHTML, tocHTML, toc }\nconsole.log(orgHTMLDocument.toString()) // => Rendered HTML\n```\n\nWriting yet another converter\n-----------------------------\n\nSee `lib/org/converter/html.js`.\n", + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "time": { + "modified": "2019-01-05T01:37:44Z", + "created": "2014-01-01T15:40:31Z", + "0.0.3-beta": "2014-01-01T15:40:33Z", + "0.0.3": "2014-01-01T15:55:45Z" + }, + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "users": { + "nak2k": true, + "bgschaid": true, + "422665vijay": true, + "nontau": true + }, + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "readmeFilename": "README.md" +} diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py index d00b9a9..63e5924 100644 --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -1,641 +1,729 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json import os import pytest from swh.loader.package import __version__ from swh.loader.package.npm.loader import ( NpmLoader, _author_str, extract_npm_package_author, ) from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Person, RawExtrinsicMetadata, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.interface import PagedResult @pytest.fixture def org_api_info(datadir) -> bytes: with open(os.path.join(datadir, "https_replicate.npmjs.com", "org"), "rb",) as f: return f.read() def test_npm_author_str(): for author, expected_author in [ ("author", "author"), ( ["Al from quantum leap", "hal from 2001 space odyssey"], "Al from quantum leap", ), ([], ""), ({"name": "groot", "email": "groot@galaxy.org",}, "groot "), ({"name": "somebody",}, "somebody"), ({"email": "no@one.org"}, " "), # note first elt is an extra blank ({"name": "no one", "email": None,}, "no one"), ({"email": None,}, ""), ({"name": None}, ""), ({"name": None, "email": None,}, ""), ({}, ""), (None, None), ({"name": []}, "",), ( {"name": ["Susan McSween", "William H. Bonney", "Doc Scurlock",]}, "Susan McSween", ), (None, None), ]: assert _author_str(author) == expected_author def test_npm_extract_npm_package_author(datadir): package_metadata_filepath = os.path.join( datadir, "https_replicate.npmjs.com", "org_visit1" ) with open(package_metadata_filepath) as json_file: package_metadata = json.load(json_file) extract_npm_package_author(package_metadata["versions"]["0.0.2"]) == Person( fullname=b"mooz ", name=b"mooz", email=b"stillpedant@gmail.com", ) assert extract_npm_package_author(package_metadata["versions"]["0.0.3"]) == Person( fullname=b"Masafumi Oyamada ", name=b"Masafumi Oyamada", email=b"stillpedant@gmail.com", ) package_json = json.loads( """ { "name": "highlightjs-line-numbers.js", "version": "2.7.0", "description": "Highlight.js line numbers plugin.", "main": "src/highlightjs-line-numbers.js", "dependencies": {}, "devDependencies": { "gulp": "^4.0.0", "gulp-rename": "^1.4.0", "gulp-replace": "^0.6.1", "gulp-uglify": "^1.2.0" }, "repository": { "type": "git", "url": "https://github.com/wcoder/highlightjs-line-numbers.js.git" }, "author": "Yauheni Pakala ", "license": "MIT", "bugs": { "url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues" }, "homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/" }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"Yauheni Pakala ", name=b"Yauheni Pakala", email=b"evgeniy.pakalo@gmail.com", ) package_json = json.loads( """ { "name": "3-way-diff", "version": "0.0.1", "description": "3-way diffing of JavaScript objects", "main": "index.js", "authors": [ { "name": "Shawn Walsh", "url": "https://github.com/shawnpwalsh" }, { "name": "Markham F Rollins IV", "url": "https://github.com/mrollinsiv" } ], "keywords": [ "3-way diff", "3 way diff", "three-way diff", "three way diff" ], "devDependencies": { "babel-core": "^6.20.0", "babel-preset-es2015": "^6.18.0", "mocha": "^3.0.2" }, "dependencies": { "lodash": "^4.15.0" } }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"Shawn Walsh", name=b"Shawn Walsh", email=None ) package_json = json.loads( """ { "name": "yfe-ynpm", "version": "1.0.0", "homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm", "repository": { "type": "git", "url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git" }, "author": [ "fengmk2 (https://fengmk2.com)", "xufuzi (https://7993.org)" ], "license": "MIT" }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"fengmk2 (https://fengmk2.com)", name=b"fengmk2", email=b"fengmk2@gmail.com", ) package_json = json.loads( """ { "name": "umi-plugin-whale", "version": "0.0.8", "description": "Internal contract component", "authors": { "name": "xiaohuoni", "email": "448627663@qq.com" }, "repository": "alitajs/whale", "devDependencies": { "np": "^3.0.4", "umi-tools": "*" }, "license": "MIT" }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"xiaohuoni <448627663@qq.com>", name=b"xiaohuoni", email=b"448627663@qq.com", ) package_json_no_authors = json.loads( """{ "authors": null, "license": "MIT" }""" ) assert extract_npm_package_author(package_json_no_authors) == Person.from_fullname( b"" ) def normalize_hashes(hashes): if isinstance(hashes, str): return hash_to_bytes(hashes) if isinstance(hashes, list): return [hash_to_bytes(x) for x in hashes] return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()} _expected_new_contents_first_visit = normalize_hashes( [ "4ce3058e16ab3d7e077f65aabf855c34895bf17c", "858c3ceee84c8311adc808f8cdb30d233ddc9d18", "0fa33b4f5a4e0496da6843a38ff1af8b61541996", "85a410f8ef8eb8920f2c384a9555566ad4a2e21b", "9163ac8025923d5a45aaac482262893955c9b37b", "692cf623b8dd2c5df2c2998fd95ae4ec99882fb4", "18c03aac6d3e910efb20039c15d70ab5e0297101", "41265c42446aac17ca769e67d1704f99e5a1394d", "783ff33f5882813dca9239452c4a7cadd4dba778", "b029cfb85107aee4590c2434a3329bfcf36f8fa1", "112d1900b4c2e3e9351050d1b542c9744f9793f3", "5439bbc4bd9a996f1a38244e6892b71850bc98fd", "d83097a2f994b503185adf4e719d154123150159", "d0939b4898e83090ee55fd9d8a60e312cfadfbaf", "b3523a26f7147e4af40d9d462adaae6d49eda13e", "cd065fb435d6fb204a8871bcd623d0d0e673088c", "2854a40855ad839a54f4b08f5cff0cf52fca4399", "b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe", "0f73d56e1cf480bded8a1ecf20ec6fc53c574713", "0d9882b2dfafdce31f4e77fe307d41a44a74cefe", "585fc5caab9ead178a327d3660d35851db713df1", "e8cd41a48d79101977e3036a87aeb1aac730686f", "5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7", "9c3cc2763bf9e9e37067d3607302c4776502df98", "3649a68410e354c83cd4a38b66bd314de4c8f5c9", "e96ed0c091de1ebdf587104eaf63400d1974a1fe", "078ca03d2f99e4e6eab16f7b75fbb7afb699c86c", "38de737da99514de6559ff163c988198bc91367a", ] ) _expected_new_directories_first_visit = normalize_hashes( [ "3370d20d6f96dc1c9e50f083e2134881db110f4f", "42753c0c2ab00c4501b552ac4671c68f3cf5aece", "d7895533ef5edbcffdea3f057d9fef3a1ef845ce", "80579be563e2ef3e385226fe7a3f079b377f142c", "3b0ddc6a9e58b4b53c222da4e27b280b6cda591c", "bcad03ce58ac136f26f000990fc9064e559fe1c0", "5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca", "e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd", "584b5b4b6cf7f038095e820b99386a9c232de931", "184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a", "bb5f4ee143c970367eb409f2e4c1104898048b9d", "1b95491047add1103db0dfdfa84a9735dcb11e88", "a00c6de13471a2d66e64aca140ddb21ef5521e62", "5ce6c1cd5cda2d546db513aaad8c72a44c7771e2", "c337091e349b6ac10d38a49cdf8c2401ef9bb0f2", "202fafcd7c0f8230e89d5496ad7f44ab12b807bf", "775cc516543be86c15c1dc172f49c0d4e6e78235", "ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e", ] ) _expected_new_releases_first_visit = normalize_hashes( { "d38cc0b571cd41f3c85513864e049766b42032a7": ( "42753c0c2ab00c4501b552ac4671c68f3cf5aece" ), "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295": ( "3370d20d6f96dc1c9e50f083e2134881db110f4f" ), "6e976db82f6c310596b21fb0ed8b11f507631434": ( "d7895533ef5edbcffdea3f057d9fef3a1ef845ce" ), } ) def package_url(package): return "https://www.npmjs.com/package/%s" % package def package_metadata_url(package): return "https://replicate.npmjs.com/%s/" % package def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info): package = "org" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) release_id = "d38cc0b571cd41f3c85513864e049766b42032a7" versions = [ ("0.0.2", release_id), ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"), ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"), ] expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target=b"releases/0.0.4", target_type=TargetType.ALIAS ), **{ b"releases/" + version_name.encode(): SnapshotBranch( target=hash_to_bytes(version_id), target_type=TargetType.RELEASE, ) for (version_name, version_id) in versions }, }, ) check_snapshot(expected_snapshot, swh_storage) assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release( name=b"0.0.2", message=b"Synthetic release for NPM source package org version 0.0.2\n", target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"), target_type=ModelObjectType.DIRECTORY, synthetic=True, author=Person( fullname=b"mooz ", name=b"mooz", email=b"stillpedant@gmail.com", ), date=TimestampWithTimezone.from_datetime( datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc) ), id=hash_to_bytes(release_id), ) contents = swh_storage.content_get(_expected_new_contents_first_visit) count = sum(0 if content is None else 1 for content in contents) assert count == len(_expected_new_contents_first_visit) assert ( list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == [] ) assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == [] metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", ) for (version_name, release_id) in versions: release = swh_storage.release_get([hash_to_bytes(release_id)])[0] assert release.target_type == ModelObjectType.DIRECTORY directory_id = release.target directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id, ) release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release_id), ) expected_metadata = [ RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.npm.loader.NpmLoader", version=__version__, ), discovery_date=loader.visit_date, format="replicate-npm-package-json", metadata=json.dumps( json.loads(org_api_info)["versions"][version_name] ).encode(), origin="https://www.npmjs.com/package/org", release=release_swhid, ) ] assert swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_npm_loader_incremental_visit(swh_storage, requests_mock_datadir_visits): package = "org" url = package_url(package) loader = NpmLoader(swh_storage, url) expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249") actual_load_status = loader.load() assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats # reset loader internal state del loader._cached_info del loader._cached__raw_info actual_load_status2 = loader.load() assert actual_load_status2["status"] == "eventful" snap_id2 = actual_load_status2["snapshot_id"] assert snap_id2 is not None assert snap_id2 != actual_load_status["snapshot_id"] assert_last_visit_matches(swh_storage, url, status="full", type="npm") stats = get_stats(swh_storage) assert { # 3 new releases artifacts "content": len(_expected_new_contents_first_visit) + 14, "directory": len(_expected_new_directories_first_visit) + 15, "origin": 1, "origin_visit": 2, "release": len(_expected_new_releases_first_visit) + 3, "revision": 0, "skipped_content": 0, "snapshot": 2, } == stats urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith("https://registry.npmjs.org") ] assert len(urls) == len(set(urls)) # we visited each artifact once across @pytest.mark.usefixtures("requests_mock_datadir") def test_npm_loader_version_divergence(swh_storage): - package = "@aller_shared" + package = "@aller/shared" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("ebbe6397d0c2a6cf7cba40fa5b043c59dd4f2497") + expected_snapshot_id = hash_to_bytes("68eed3d3bc852e7f435a84f18ee77e23f6884be2") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"releases/0.1.0" ), b"releases/0.1.0": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("04c66f3a82aa001e8f1b45246b58b82d2b0ca0df"), + target=hash_to_bytes("0c486b50b407f847ef7581f595c2b6c2062f1089"), ), b"releases/0.1.1-alpha.14": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("90cc04dc72193f3b1444f10e1c525bee2ea9dac6"), + target=hash_to_bytes("79d80c87c0a8d104a216cc539baad962a454802a"), ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { # 1 new releases artifacts "content": 534, "directory": 153, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats +def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir): + """Test with two versions that have exactly the same tarball""" + package = "org_version_mismatch" + url = package_url(package) + loader = NpmLoader(swh_storage, url) + + actual_load_status = loader.load() + expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a") + assert actual_load_status == { + "status": "eventful", + "snapshot_id": expected_snapshot_id.hex(), + } + + assert_last_visit_matches( + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id + ) + + beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1" + release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0" + versions = [ + ("0.0.3-beta", beta_release_id), + ("0.0.3", release_id), + ] + + expected_snapshot = Snapshot( + id=expected_snapshot_id, + branches={ + b"HEAD": SnapshotBranch( + target=b"releases/0.0.3", target_type=TargetType.ALIAS + ), + **{ + b"releases/" + + version_name.encode(): SnapshotBranch( + target=hash_to_bytes(version_id), target_type=TargetType.RELEASE, + ) + for (version_name, version_id) in versions + }, + }, + ) + check_snapshot(expected_snapshot, swh_storage) + + assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release( + name=b"0.0.3-beta", + message=( + b"Synthetic release for NPM source package org_version_mismatch " + b"version 0.0.3-beta\n" + ), + target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"), + target_type=ModelObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname(b"Masafumi Oyamada "), + date=TimestampWithTimezone.from_datetime( + datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc) + ), + id=hash_to_bytes(beta_release_id), + ) + + assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release( + name=b"0.0.3", + message=( + b"Synthetic release for NPM source package org_version_mismatch " + b"version 0.0.3\n" + ), + target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"), + target_type=ModelObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname(b"Masafumi Oyamada "), + date=TimestampWithTimezone.from_datetime( + datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc) + ), + id=hash_to_bytes(release_id), + ) + + # Check incremental re-load keeps it unchanged + + loader = NpmLoader(swh_storage, url) + + actual_load_status = loader.load() + assert actual_load_status == { + "status": "uneventful", + "snapshot_id": expected_snapshot_id.hex(), + } + + assert_last_visit_matches( + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id + ) + + def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion """ package = "nativescript-telerik-analytics" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() # no branch as one artifact without any intrinsic metadata expected_snapshot = Snapshot( id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}, ) assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot.id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_npm_artifact_with_no_upload_time(swh_storage, requests_mock_datadir): """With no time upload, artifact is skipped """ package = "jammit-no-time" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() # no branch as one artifact without any intrinsic metadata expected_snapshot = Snapshot( id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}, ) assert actual_load_status == { "status": "uneventful", "snapshot_id": expected_snapshot.id.hex(), } assert_last_visit_matches( swh_storage, url, status="partial", type="npm", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_npm_artifact_use_mtime_if_no_time(swh_storage, requests_mock_datadir): """With no time upload, artifact is skipped """ package = "jammit-express" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("33b8f105d48ce16b6c59158af660e0cc78bcbef4") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } # artifact is used expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"releases/0.0.1" ), b"releases/0.0.1": SnapshotBranch( target_type=TargetType.RELEASE, target=hash_to_bytes("3e3b800570869fa9b3dbc302500553e62400cc06"), ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_npm_no_artifact(swh_storage, requests_mock_datadir): """If no artifacts at all is found for origin, the visit fails completely """ package = "catify" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == { "status": "failed", } assert_last_visit_matches(swh_storage, url, status="failed", type="npm") def test_npm_origin_not_found(swh_storage, requests_mock_datadir): url = package_url("non-existent-url") loader = NpmLoader(swh_storage, url) assert loader.load() == {"status": "failed"} assert_last_visit_matches( swh_storage, url, status="not_found", type="npm", snapshot=None )