diff --git a/swh/loader/cli.py b/swh/loader/cli.py --- a/swh/loader/cli.py +++ b/swh/loader/cli.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -48,16 +48,35 @@ logger.debug(f"registry: {registry_entry}") loader_cls = registry_entry["loader"] logger.debug(f"loader class: {loader_cls}") - return loader_cls(**kwargs) + return loader_cls.from_config(**kwargs) @swh_cli_group.group(name="loader", context_settings=CONTEXT_SETTINGS) +@click.option( + "--config-file", + "-C", + default=None, + type=click.Path(exists=True, dir_okay=False,), + help="Configuration file.", +) @click.pass_context -def loader(ctx): +def loader(ctx, config_file): """Loader cli tools """ - pass + from os import environ + + from swh.core.config import read + + ctx.ensure_object(dict) + logger.debug("ctx: %s", ctx) + + if not config_file: + config_file = environ.get("SWH_CONFIG_FILENAME") + + ctx.obj["config"] = read(config_file) + logger.debug("config_file: %s", config_file) + logger.debug("config: %s", ctx.obj["config"]) @loader.command(name="run", context_settings=CONTEXT_SETTINGS) @@ -71,6 +90,10 @@ from swh.scheduler.cli.utils import parse_options + conf = ctx.obj.get("config", {}) + if "storage" not in conf: + raise ValueError("Missing storage configuration key") + (_, kw) = parse_options(options) logger.debug(f"kw: {kw}") visit_date = kw.get("visit_date") @@ -78,7 +101,7 @@ visit_date = iso8601.parse_date(visit_date) kw["visit_date"] = visit_date - loader = get_loader(type, url=url, **kw) + loader = get_loader(type, url=url, storage=conf["storage"], **kw) result = loader.load() click.echo(result) diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py --- a/swh/loader/core/loader.py +++ 
b/swh/loader/core/loader.py @@ -3,7 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from abc import ABCMeta, abstractmethod import datetime import hashlib import logging @@ -26,24 +25,82 @@ Snapshot, ) from swh.storage import get_storage +from swh.storage.interface import StorageInterface from swh.storage.utils import now DEFAULT_CONFIG: Dict[str, Any] = { "max_content_size": 100 * 1024 * 1024, - "save_data": False, - "save_data_path": "", - "storage": {"cls": "memory"}, } -class BaseLoader(metaclass=ABCMeta): - """Mixin base class for loader. +class Loader: + """The base class for a Software Heritage Loader. - To use this class, you must: + A loader retrieves origin information (git/mercurial/svn repositories, pypi/npm/... + package artifacts), ingests the contents/directories/revisions/releases/snapshot to + the storage backend. + + For now, this just exposes 2 static methods (from_config, from_configfile) to + centralize and ease the loader instantiation. + + Args: + storage: the instance of the Storage being used to register the + origin information + + """ + + def __init__( + self, storage: StorageInterface, max_content_size: Optional[int] = None, + ): + self.storage = storage + self.max_content_size = int(max_content_size) if max_content_size else None + + @classmethod + def from_config(cls, storage: Dict[str, Any], **config: Any): + """Instantiate a loader from a configuration dict. + + This is basically a backwards-compatibility shim for the CLI. + + Args: + storage: instantiation config for the storage + config: the configuration dict for the loader, with the following keys: + - credentials (optional): credentials list for the scheduler + - any other kwargs passed to the loader. + + Returns: + the instantiated loader + """ + # Drop the legacy config keys which aren't used for this generation of loader. 
+ for legacy_key in ("storage", "celery"): + config.pop(legacy_key, None) + + # Instantiate the storage + storage_instance = get_storage(**storage) + return cls(storage=storage_instance, **config) + + @classmethod + def from_configfile(cls, **kwargs: Any): + """Instantiate a loader from the configuration loaded from the + SWH_CONFIG_FILENAME envvar, with potential extra keyword arguments if their + value is not None. + + Args: + kwargs: kwargs passed to the loader instantiation + + """ + config = dict(load_from_envvar(DEFAULT_CONFIG)) + config.update({k: v for k, v in kwargs.items() if v is not None}) + return cls.from_config(**config) + + +class BaseLoader(Loader): + """Mixin base class for (D)VCS loaders (e.g svn, git, mercurial, ...). + + To define such loaders, you must: - inherit from this class - - and implement the @abstractmethod methods: + - and implement following methods: - :func:`prepare`: First step executed by the loader to prepare some state needed by the `func`:load method. 
@@ -73,15 +130,12 @@ def __init__( self, + storage: StorageInterface, logging_class: Optional[str] = None, - config: Optional[Dict[str, Any]] = None, + save_data_path: Optional[str] = None, + max_content_size: Optional[int] = None, ): - if config: - self.config = config - else: - self.config = load_from_envvar(DEFAULT_CONFIG) - - self.storage = get_storage(**self.config["storage"]) + super().__init__(storage=storage, max_content_size=max_content_size) if logging_class is None: logging_class = "%s.%s" % ( @@ -93,28 +147,24 @@ _log = logging.getLogger("requests.packages.urllib3.connectionpool") _log.setLevel(logging.WARN) - self.max_content_size = self.config["max_content_size"] - # possibly overridden in self.prepare method self.visit_date: Optional[datetime.datetime] = None - self.origin: Optional[Origin] = None if not hasattr(self, "visit_type"): self.visit_type: Optional[str] = None self.origin_metadata: Dict[str, Any] = {} - self.loaded_snapshot_id: Optional[Sha1Git] = None - # Make sure the config is sane - save_data = self.config.get("save_data") - if save_data: - path = self.config["save_data_path"] + if save_data_path: + path = save_data_path os.stat(path) if not os.access(path, os.R_OK | os.W_OK): raise PermissionError("Permission denied: %r" % path) + self.save_data_path = save_data_path + def save_data(self) -> None: """Save the data associated to the current load""" raise NotImplementedError @@ -129,7 +179,7 @@ origin_url_hash = hashlib.sha1(url).hexdigest() path = "%s/sha1:%s/%s/%s" % ( - self.config["save_data_path"], + self.save_data_path, origin_url_hash[0:2], origin_url_hash, year, @@ -146,21 +196,19 @@ """ self.storage.flush() - @abstractmethod def cleanup(self) -> None: """Last step executed by the loader. """ - pass + raise NotImplementedError - @abstractmethod def prepare_origin_visit(self, *args, **kwargs) -> None: """First step executed by the loader to prepare origin and visit references. 
Set/update self.origin, and optionally self.origin_url, self.visit_date. """ - pass + raise NotImplementedError def _store_origin_visit(self) -> None: """Store origin and visit references. Sets the self.visit references. @@ -185,7 +233,6 @@ ) )[0] - @abstractmethod def prepare(self, *args, **kwargs) -> None: """Second step executed by the loader to prepare some state needed by the loader. @@ -194,7 +241,7 @@ NotFound exception if the origin to ingest is not found. """ - pass + raise NotImplementedError def get_origin(self) -> Origin: """Get the origin that is currently being loaded. @@ -207,7 +254,6 @@ assert self.origin return self.origin - @abstractmethod def fetch_data(self) -> bool: """Fetch the data from the source the loader is currently loading (ex: git/hg/svn/... repository). @@ -217,16 +263,15 @@ to be called again to complete loading. """ - pass + raise NotImplementedError - @abstractmethod def store_data(self): """Store fetched data in the database. Should call the :func:`maybe_load_xyz` methods, which handle the bundles sent to storage, rather than send directly. """ - pass + raise NotImplementedError def store_metadata(self) -> None: """Store fetched metadata in the database. 
@@ -423,7 +468,7 @@ def store_data(self) -> None: assert self.origin - if self.config.get("save_data"): + if self.save_data_path: self.save_data() if self.has_contents(): diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py --- a/swh/loader/core/tests/test_loader.py +++ b/swh/loader/core/tests/test_loader.py @@ -7,7 +7,7 @@ import hashlib import logging -from swh.loader.core.loader import DEFAULT_CONFIG, BaseLoader, DVCSLoader +from swh.loader.core.loader import BaseLoader, DVCSLoader from swh.loader.exception import NotFound from swh.loader.tests import assert_last_visit_matches from swh.model.hashutil import hash_to_bytes @@ -77,54 +77,51 @@ pass -def test_base_loader(swh_config): - loader = DummyBaseLoader() +def test_base_loader(swh_storage): + loader = DummyBaseLoader(swh_storage) result = loader.load() assert result == {"status": "eventful"} -def test_base_loader_with_config(swh_config): - loader = DummyBaseLoader("logger-name", DEFAULT_CONFIG) +def test_base_loader_with_config(swh_storage): + loader = DummyBaseLoader(swh_storage, "logger-name") result = loader.load() assert result == {"status": "eventful"} -def test_dvcs_loader(swh_config): - loader = DummyDVCSLoader() +def test_dvcs_loader(swh_storage): + loader = DummyDVCSLoader(swh_storage) result = loader.load() assert result == {"status": "eventful"} -def test_dvcs_loader_with_config(swh_config): - loader = DummyDVCSLoader("another-logger", DEFAULT_CONFIG) +def test_dvcs_loader_with_config(swh_storage): + loader = DummyDVCSLoader(swh_storage, "another-logger") result = loader.load() assert result == {"status": "eventful"} -def test_loader_logger_default_name(swh_config): - loader = DummyBaseLoader() +def test_loader_logger_default_name(swh_storage): + loader = DummyBaseLoader(swh_storage) assert isinstance(loader.log, logging.Logger) assert loader.log.name == "swh.loader.core.tests.test_loader.DummyBaseLoader" - loader = DummyDVCSLoader() + loader = 
DummyDVCSLoader(swh_storage) assert isinstance(loader.log, logging.Logger) assert loader.log.name == "swh.loader.core.tests.test_loader.DummyDVCSLoader" -def test_loader_logger_with_name(swh_config): - loader = DummyBaseLoader("some.logger.name") +def test_loader_logger_with_name(swh_storage): + loader = DummyBaseLoader(swh_storage, "some.logger.name") assert isinstance(loader.log, logging.Logger) assert loader.log.name == "some.logger.name" -def test_loader_save_data_path(swh_config, tmp_path): - loader = DummyBaseLoader("some.logger.name.1") +def test_loader_save_data_path(swh_storage, tmp_path): + loader = DummyBaseLoader(swh_storage, "some.logger.name.1", save_data_path=tmp_path) url = "http://bitbucket.org/something" loader.origin = Origin(url=url) loader.visit_date = datetime.datetime(year=2019, month=10, day=1) - loader.config = { - "save_data_path": tmp_path, - } hash_url = hashlib.sha1(url.encode("utf-8")).hexdigest() expected_save_path = "%s/sha1:%s/%s/2019" % (str(tmp_path), hash_url[0:2], hash_url) @@ -164,11 +161,11 @@ raise RuntimeError("Failed to get contents!") -def test_dvcs_loader_exc_partial_visit(swh_config, caplog): +def test_dvcs_loader_exc_partial_visit(swh_storage, caplog): logger_name = "dvcsloaderexc" caplog.set_level(logging.ERROR, logger=logger_name) - loader = DummyDVCSLoaderExc(logging_class=logger_name) + loader = DummyDVCSLoaderExc(swh_storage, logging_class=logger_name) # fake the loading ending up in a snapshot loader.loaded_snapshot_id = hash_to_bytes( "9e4dd2b40d1b46b70917c0949aa2195c823a648e" @@ -203,11 +200,11 @@ self.storage = BrokenStorageProxy(self.storage) -def test_dvcs_loader_storage_exc_failed_visit(swh_config, caplog): +def test_dvcs_loader_storage_exc_failed_visit(swh_storage, caplog): logger_name = "dvcsloaderexc" caplog.set_level(logging.ERROR, logger=logger_name) - loader = DummyDVCSLoaderStorageExc(logging_class=logger_name) + loader = DummyDVCSLoaderStorageExc(swh_storage, logging_class=logger_name) result = 
loader.load() assert result == {"status": "failed"} @@ -231,8 +228,8 @@ } -def test_loader_not_found(swh_config, caplog): - loader = DummyDVCSLoaderNotFound() +def test_loader_not_found(swh_storage, caplog): + loader = DummyDVCSLoaderNotFound(swh_storage) result = loader.load() assert result == {"status": "uneventful"} diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -20,6 +20,7 @@ Sha1Git, TimestampWithTimezone, ) +from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) SWH_PERSON = Person( @@ -71,9 +72,11 @@ def __init__( self, + storage: StorageInterface, url: str, artifacts: Sequence[Dict[str, Any]], identity_artifact_keys: Optional[Sequence[str]] = None, + max_content_size: Optional[int] = None, ): """Loader constructor. 
@@ -98,7 +101,7 @@ "identity" of an artifact """ - super().__init__(url=url) + super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.artifacts = artifacts # assume order is enforced in the lister self.identity_artifact_keys = identity_artifact_keys diff --git a/swh/loader/package/archive/tasks.py b/swh/loader/package/archive/tasks.py --- a/swh/loader/package/archive/tasks.py +++ b/swh/loader/package/archive/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,4 +11,5 @@ @shared_task(name=__name__ + ".LoadArchive") def load_archive_files(*, url=None, artifacts=None): """Load archive's artifacts (e.g gnu, etc...)""" - return ArchiveLoader(url, artifacts).load() + loader = ArchiveLoader.from_configfile(url=url, artifacts=artifacts) + return loader.load() diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py --- a/swh/loader/package/archive/tests/test_archive.py +++ b/swh/loader/package/archive/tests/test_archive.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -70,10 +70,11 @@ } -def visit_with_no_artifact_found(swh_config, requests_mock_datadir): +def test_archive_visit_with_no_artifact_found(swh_storage, requests_mock_datadir): url = URL unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz" loader = ArchiveLoader( + swh_storage, url, artifacts=[ { @@ -89,7 +90,7 @@ actual_load_status = loader.load() assert 
actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 0, @@ -102,20 +103,20 @@ "snapshot": 1, } == stats - assert_last_visit_matches(loader.storage, url, status="partial", type="tar") + assert_last_visit_matches(swh_storage, url, status="partial", type="tar") -def test_check_revision_metadata_structure(swh_config, requests_mock_datadir): - loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS) +def test_archive_check_revision_metadata_structure(swh_storage, requests_mock_datadir): + loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None - assert_last_visit_matches(loader.storage, URL, status="full", type="tar") + assert_last_visit_matches(swh_storage, URL, status="full", type="tar") expected_revision_id = hash_to_bytes("44183488c0774ce3c957fa19ba695cf18a4a42b3") - revision = loader.storage.revision_get([expected_revision_id])[0] + revision = swh_storage.revision_get([expected_revision_id])[0] assert revision is not None check_metadata_paths( @@ -136,11 +137,13 @@ ) -def test_visit_with_release_artifact_no_prior_visit(swh_config, requests_mock_datadir): +def test_archive_visit_with_release_artifact_no_prior_visit( + swh_storage, requests_mock_datadir +): """With no prior visit, load a gnu project ends up with 1 snapshot """ - loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS) + loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" @@ -154,9 +157,9 @@ == expected_snapshot_first_visit_id ) - assert_last_visit_matches(loader.storage, URL, status="full", type="tar") + assert_last_visit_matches(swh_storage, URL, status="full", type="tar") - stats = get_stats(loader.storage) + stats = 
get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), @@ -169,13 +172,13 @@ } == stats expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit) - assert list(loader.storage.content_missing_per_sha1(expected_contents)) == [] + assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit) - assert list(loader.storage.directory_missing(expected_dirs)) == [] + assert list(swh_storage.directory_missing(expected_dirs)) == [] expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit) - assert list(loader.storage.revision_missing(expected_revs)) == [] + assert list(swh_storage.revision_missing(expected_revs)) == [] expected_snapshot = Snapshot( id=expected_snapshot_first_visit_id, @@ -190,28 +193,28 @@ }, ) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) -def test_2_visits_without_change(swh_config, requests_mock_datadir): +def test_archive_2_visits_without_change(swh_storage, requests_mock_datadir): """With no prior visit, load a gnu project ends up with 1 snapshot """ url = URL - loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS) + loader = ArchiveLoader(swh_storage, url, artifacts=GNU_ARTIFACTS) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None - assert_last_visit_matches(loader.storage, url, status="full", type="tar") + assert_last_visit_matches(swh_storage, url, status="full", type="tar") actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert actual_load_status2["snapshot_id"] is not None assert actual_load_status["snapshot_id"] == actual_load_status2["snapshot_id"] - assert_last_visit_matches(loader.storage, url, status="full", type="tar") + assert_last_visit_matches(swh_storage, url, 
status="full", type="tar") urls = [ m.url @@ -221,21 +224,21 @@ assert len(urls) == 1 -def test_2_visits_with_new_artifact(swh_config, requests_mock_datadir): +def test_archive_2_visits_with_new_artifact(swh_storage, requests_mock_datadir): """With no prior visit, load a gnu project ends up with 1 snapshot """ url = URL artifact1 = GNU_ARTIFACTS[0] - loader = ArchiveLoader(url, [artifact1]) + loader = ArchiveLoader(swh_storage, url, [artifact1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None - assert_last_visit_matches(loader.storage, url, status="full", type="tar") + assert_last_visit_matches(swh_storage, url, status="full", type="tar") - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), @@ -262,17 +265,15 @@ "version": "0.2.0", } - loader2 = ArchiveLoader(url, [artifact1, artifact2]) - # implementation detail: share the storage in between visits - loader2.storage = loader.storage - stats2 = get_stats(loader2.storage) + loader2 = ArchiveLoader(swh_storage, url, [artifact1, artifact2]) + stats2 = get_stats(swh_storage) assert stats == stats2 # ensure we share the storage actual_load_status2 = loader2.load() assert actual_load_status2["status"] == "eventful" assert actual_load_status2["snapshot_id"] is not None - stats2 = get_stats(loader.storage) + stats2 = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit) + 14, "directory": len(_expected_new_directories_first_visit) + 8, @@ -284,7 +285,7 @@ "snapshot": 1 + 1, } == stats2 - assert_last_visit_matches(loader.storage, url, status="full", type="tar") + assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url @@ -295,7 +296,7 @@ assert len(urls) == 2 -def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir): +def 
test_archive_2_visits_without_change_not_gnu(swh_storage, requests_mock_datadir): """Load a project archive (not gnu) ends up with 1 snapshot """ @@ -315,18 +316,21 @@ # Here the loader defines the id_keys to use for existence in the snapshot # It's not the default archive loader which loader = ArchiveLoader( - url, artifacts=artifacts, identity_artifact_keys=["sha256", "length", "url"] + swh_storage, + url, + artifacts=artifacts, + identity_artifact_keys=["sha256", "length", "url"], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None - assert_last_visit_matches(loader.storage, url, status="full", type="tar") + assert_last_visit_matches(swh_storage, url, status="full", type="tar") actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert actual_load_status2["snapshot_id"] == actual_load_status["snapshot_id"] - assert_last_visit_matches(loader.storage, url, status="full", type="tar") + assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url @@ -336,7 +340,7 @@ assert len(urls) == 1 -def test_artifact_identity(): +def test_archive_artifact_identity(): """Compute primary key should return the right identity """ diff --git a/swh/loader/package/archive/tests/test_tasks.py b/swh/loader/package/archive/tests/test_tasks.py --- a/swh/loader/package/archive/tests/test_tasks.py +++ b/swh/loader/package/archive/tests/test_tasks.py @@ -1,21 +1,21 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -def test_archive_loader( +def test_tasks_archive_loader( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config ): - mock_loader = 
mocker.patch("swh.loader.package.archive.loader.ArchiveLoader.load") - mock_loader.return_value = {"status": "eventful"} + mock_load = mocker.patch("swh.loader.package.archive.loader.ArchiveLoader.load") + mock_load.return_value = {"status": "eventful"} res = swh_scheduler_celery_app.send_task( "swh.loader.package.archive.tasks.LoadArchive", - kwargs={"url": "some-url", "artifacts": []}, + kwargs=dict(url="https://gnu.org/", artifacts=[]), ) assert res res.wait() assert res.successful() - + assert mock_load.called assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py --- a/swh/loader/package/cran/loader.py +++ b/swh/loader/package/cran/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -24,6 +24,7 @@ Sha1Git, TimestampWithTimezone, ) +from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) @@ -52,7 +53,13 @@ class CRANLoader(PackageLoader[CRANPackageInfo]): visit_type = "cran" - def __init__(self, url: str, artifacts: List[Dict]): + def __init__( + self, + storage: StorageInterface, + url: str, + artifacts: List[Dict], + max_content_size: Optional[int] = None, + ): """Loader constructor. 
Args: @@ -60,7 +67,7 @@ artifacts: List of associated artifact for the origin url """ - super().__init__(url=url) + super().__init__(storage=storage, url=url, max_content_size=max_content_size) # explicit what we consider the artifact identity self.artifacts = artifacts diff --git a/swh/loader/package/cran/tasks.py b/swh/loader/package/cran/tasks.py --- a/swh/loader/package/cran/tasks.py +++ b/swh/loader/package/cran/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,4 +11,4 @@ @shared_task(name=__name__ + ".LoadCRAN") def load_cran(url=None, artifacts=[]): """Load CRAN's artifacts""" - return CRANLoader(url, artifacts).load() + return CRANLoader.from_configfile(url=url, artifacts=artifacts).load() diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py --- a/swh/loader/package/cran/tests/test_cran.py +++ b/swh/loader/package/cran/tests/test_cran.py @@ -121,7 +121,7 @@ @pytest.mark.fs -def test_extract_intrinsic_metadata(tmp_path, datadir): +def test_cran_extract_intrinsic_metadata(tmp_path, datadir): """Parsing existing archive's PKG-INFO should yield results""" uncompressed_archive_path = str(tmp_path) # sample url @@ -152,7 +152,7 @@ @pytest.mark.fs -def test_extract_intrinsic_metadata_failures(tmp_path): +def test_cran_extract_intrinsic_metadata_failures(tmp_path): """Parsing inexistent path/archive/PKG-INFO yield None""" # inexistent first level path assert extract_intrinsic_metadata("/something-inexistent") == {} @@ -164,7 +164,7 @@ assert extract_intrinsic_metadata(tmp_path) == {} -def test_cran_one_visit(swh_config, requests_mock_datadir): +def test_cran_one_visit(swh_storage, requests_mock_datadir): version = "2.22-6" 
base_url = "https://cran.r-project.org" origin_url = f"{base_url}/Packages/Recommended_KernSmooth/index.html" @@ -172,7 +172,7 @@ f"{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz" # noqa ) loader = CRANLoader( - origin_url, artifacts=[{"url": artifact_url, "version": version,}] + swh_storage, origin_url, artifacts=[{"url": artifact_url, "version": version,}] ) actual_load_status = loader.load() @@ -182,11 +182,11 @@ "snapshot_id": SNAPSHOT.id.hex(), } - check_snapshot(SNAPSHOT, loader.storage) + check_snapshot(SNAPSHOT, swh_storage) - assert_last_visit_matches(loader.storage, origin_url, status="full", type="cran") + assert_last_visit_matches(swh_storage, origin_url, status="full", type="cran") - visit_stats = get_stats(loader.storage) + visit_stats = get_stats(swh_storage) assert { "content": 33, "directory": 7, @@ -207,7 +207,7 @@ assert len(urls) == 1 -def test_cran_2_visits_same_origin(swh_config, requests_mock_datadir): +def test_cran_2_visits_same_origin(swh_storage, requests_mock_datadir): """Multiple visits on the same origin, only 1 archive fetch""" version = "2.22-6" base_url = "https://cran.r-project.org" @@ -216,7 +216,7 @@ f"{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz" # noqa ) loader = CRANLoader( - origin_url, artifacts=[{"url": artifact_url, "version": version}] + swh_storage, origin_url, artifacts=[{"url": artifact_url, "version": version}] ) # first visit @@ -228,11 +228,11 @@ "snapshot_id": SNAPSHOT.id.hex(), } - check_snapshot(SNAPSHOT, loader.storage) + check_snapshot(SNAPSHOT, swh_storage) - assert_last_visit_matches(loader.storage, origin_url, status="full", type="cran") + assert_last_visit_matches(swh_storage, origin_url, status="full", type="cran") - visit_stats = get_stats(loader.storage) + visit_stats = get_stats(swh_storage) assert { "content": 33, "directory": 7, @@ -252,9 +252,9 @@ "snapshot_id": expected_snapshot_id, } - assert_last_visit_matches(loader.storage, origin_url, status="full", 
type="cran") + assert_last_visit_matches(swh_storage, origin_url, status="full", type="cran") - visit_stats2 = get_stats(loader.storage) + visit_stats2 = get_stats(swh_storage) visit_stats["origin_visit"] += 1 assert visit_stats2 == visit_stats, "same stats as 1st visit, +1 visit" @@ -266,7 +266,7 @@ assert len(urls) == 1, "visited one time artifact url (across 2 visits)" -def test_parse_debian_control(datadir): +def test_cran_parse_debian_control(datadir): description_file = os.path.join(datadir, "description", "acepack") actual_metadata = parse_debian_control(description_file) @@ -287,7 +287,7 @@ } -def test_parse_debian_control_unicode_issue(datadir): +def test_cran_parse_debian_control_unicode_issue(datadir): # iso-8859-1 caused failure, now fixed description_file = os.path.join(datadir, "description", "KnownBR") @@ -317,7 +317,7 @@ ["build_extrinsic_snapshot_metadata", "build_extrinsic_origin_metadata",], ) def test_cran_fail_to_build_or_load_extrinsic_metadata( - method_name, swh_config, requests_mock_datadir + method_name, swh_storage, requests_mock_datadir ): """problem during loading: {visit: failed, status: failed, no snapshot} @@ -335,7 +335,9 @@ side_effect=ValueError("Fake to fail to build or load extrinsic metadata"), ): loader = CRANLoader( - origin_url, artifacts=[{"url": artifact_url, "version": version}] + swh_storage, + origin_url, + artifacts=[{"url": artifact_url, "version": version}], ) actual_load_status = loader.load() @@ -345,7 +347,7 @@ "snapshot_id": SNAPSHOT.id.hex(), } - visit_stats = get_stats(loader.storage) + visit_stats = get_stats(swh_storage) assert { "content": 33, "directory": 7, @@ -358,5 +360,5 @@ } == visit_stats assert_last_visit_matches( - loader.storage, origin_url, status="partial", type="cran" + swh_storage, origin_url, status="partial", type="cran" ) diff --git a/swh/loader/package/cran/tests/test_tasks.py b/swh/loader/package/cran/tests/test_tasks.py --- a/swh/loader/package/cran/tests/test_tasks.py +++ 
b/swh/loader/package/cran/tests/test_tasks.py @@ -1,24 +1,23 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -def test_cran_loader( +def test_tasks_cran_loader( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config ): - mock_loader = mocker.patch("swh.loader.package.cran.loader.CRANLoader.load") - mock_loader.return_value = {"status": "eventful"} + mock_load = mocker.patch("swh.loader.package.cran.loader.CRANLoader.load") + mock_load.return_value = {"status": "eventful"} res = swh_scheduler_celery_app.send_task( "swh.loader.package.cran.tasks.LoadCRAN", - kwargs={ - "url": "some-url", - "artifacts": {"version": "1.2.3", "url": "artifact-url"}, - }, + kwargs=dict( + url="some-url", artifacts=[{"version": "1.2.3", "url": "artifact-url"}], + ), ) assert res res.wait() assert res.successful() - + assert mock_load.called assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py --- a/swh/loader/package/debian/loader.py +++ b/swh/loader/package/debian/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -34,6 +34,7 @@ Sha1Git, TimestampWithTimezone, ) +from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) UPLOADERS_SPLIT = re.compile(r"(?<=\>)\s*,\s*") @@ -103,7 +104,14 @@ visit_type = "deb" - def __init__(self, url: str, date: str, packages: Mapping[str, Any]): + def __init__( + self, + storage: 
StorageInterface, + url: str, + date: str, + packages: Mapping[str, Any], + max_content_size: Optional[int] = None, + ): """Debian Loader implementation. Args: @@ -143,7 +151,7 @@ } """ - super().__init__(url=url) + super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.packages = packages def get_versions(self) -> Sequence[str]: diff --git a/swh/loader/package/debian/tasks.py b/swh/loader/package/debian/tasks.py --- a/swh/loader/package/debian/tasks.py +++ b/swh/loader/package/debian/tasks.py @@ -11,4 +11,5 @@ @shared_task(name=__name__ + ".LoadDebian") def load_deb_package(*, url, date, packages): """Load Debian package""" - return DebianLoader(url, date, packages).load() + loader = DebianLoader.from_configfile(url=url, date=date, packages=packages) + return loader.load() diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py --- a/swh/loader/package/debian/tests/test_debian.py +++ b/swh/loader/package/debian/tests/test_debian.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -99,12 +99,15 @@ } -def test_debian_first_visit(swh_config, requests_mock_datadir): +def test_debian_first_visit(swh_storage, requests_mock_datadir): """With no prior visit, load a gnu project ends up with 1 snapshot """ loader = DebianLoader( - url=URL, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGE_PER_VERSION, + swh_storage, + URL, + date="2019-10-12T05:58:09.165557+00:00", + packages=PACKAGE_PER_VERSION, ) actual_load_status = loader.load() @@ -114,9 +117,9 @@ "snapshot_id": expected_snapshot_id, } - assert_last_visit_matches(loader.storage, URL, status="full", type="deb") + assert_last_visit_matches(swh_storage, URL, 
status="full", type="deb") - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 42, "directory": 2, @@ -138,15 +141,18 @@ }, ) # different than the previous loader as no release is done - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) -def test_debian_first_visit_then_another_visit(swh_config, requests_mock_datadir): +def test_debian_first_visit_then_another_visit(swh_storage, requests_mock_datadir): """With no prior visit, load a debian project ends up with 1 snapshot """ loader = DebianLoader( - url=URL, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGE_PER_VERSION + swh_storage, + URL, + date="2019-10-12T05:58:09.165557+00:00", + packages=PACKAGE_PER_VERSION, ) actual_load_status = loader.load() @@ -157,9 +163,9 @@ "snapshot_id": expected_snapshot_id, } - assert_last_visit_matches(loader.storage, URL, status="full", type="deb") + assert_last_visit_matches(swh_storage, URL, status="full", type="deb") - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 42, "directory": 2, @@ -181,14 +187,14 @@ }, ) # different than the previous loader as no release is done - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) # No change in between load actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" - assert_last_visit_matches(loader.storage, URL, status="full", type="deb") + assert_last_visit_matches(swh_storage, URL, status="full", type="deb") - stats2 = get_stats(loader.storage) + stats2 = get_stats(swh_storage) assert { "content": 42 + 0, "directory": 2 + 0, @@ -209,7 +215,7 @@ assert len(urls) == len(set(urls)) -def test_uid_to_person(): +def test_debian_uid_to_person(): uid = "Someone Name " actual_person = uid_to_person(uid) @@ -220,7 +226,7 @@ } -def test_prepare_person(): +def test_debian_prepare_person(): actual_author = prepare_person( { "name": 
"Someone Name", @@ -236,7 +242,7 @@ ) -def test_download_package(datadir, tmpdir, requests_mock_datadir): +def test_debian_download_package(datadir, tmpdir, requests_mock_datadir): tmpdir = str(tmpdir) # py3.5 work around (LocalPath issue) p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL) all_hashes = download_package(p_info, tmpdir) @@ -279,7 +285,7 @@ } -def test_dsc_information_ok(): +def test_debian_dsc_information_ok(): fname = "cicero_0.7.2-3.dsc" p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL) dsc_url, dsc_name = dsc_information(p_info) @@ -288,7 +294,7 @@ assert dsc_name == PACKAGE_FILES["files"][fname]["name"] -def test_dsc_information_not_found(): +def test_debian_dsc_information_not_found(): fname = "cicero_0.7.2-3.dsc" p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL) p_info.files.pop(fname) @@ -299,7 +305,7 @@ assert dsc_name is None -def test_dsc_information_too_many_dsc_entries(): +def test_debian_dsc_information_too_many_dsc_entries(): # craft an extra dsc file fname = "cicero_0.7.2-3.dsc" p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL) @@ -315,7 +321,9 @@ dsc_information(p_info) -def test_get_intrinsic_package_metadata(requests_mock_datadir, datadir, tmp_path): +def test_debian_get_intrinsic_package_metadata( + requests_mock_datadir, datadir, tmp_path +): tmp_path = str(tmp_path) # py3.5 compat. 
p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL) @@ -372,9 +380,12 @@ ) -def test_debian_multiple_packages(swh_config, requests_mock_datadir): +def test_debian_multiple_packages(swh_storage, requests_mock_datadir): loader = DebianLoader( - url=URL, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGES_PER_VERSION + swh_storage, + URL, + date="2019-10-12T05:58:09.165557+00:00", + packages=PACKAGES_PER_VERSION, ) actual_load_status = loader.load() @@ -384,7 +395,7 @@ "snapshot_id": expected_snapshot_id, } - assert_last_visit_matches(loader.storage, URL, status="full", type="deb") + assert_last_visit_matches(swh_storage, URL, status="full", type="deb") expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), @@ -400,10 +411,10 @@ }, ) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) -def test_resolve_revision_from_edge_cases(): +def test_debian_resolve_revision_from_edge_cases(): """Solving revision with empty data will result in unknown revision """ @@ -435,7 +446,7 @@ ) -def test_resolve_revision_from_edge_cases_hit_and_miss(): +def test_debian_resolve_revision_from_edge_cases_hit_and_miss(): """Solving revision with inconsistent data will result in unknown revision """ @@ -456,7 +467,7 @@ assert actual_revision is None -def test_resolve_revision_from(): +def test_debian_resolve_revision_from(): """Solving revision with consistent data will solve the revision """ diff --git a/swh/loader/package/debian/tests/test_tasks.py b/swh/loader/package/debian/tests/test_tasks.py --- a/swh/loader/package/debian/tests/test_tasks.py +++ b/swh/loader/package/debian/tests/test_tasks.py @@ -1,21 +1,21 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more 
information -def test_debian_loader( +def test_tasks_debian_loader( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config ): - mock_loader = mocker.patch("swh.loader.package.debian.loader.DebianLoader.load") - mock_loader.return_value = {"status": "eventful"} + mock_load = mocker.patch("swh.loader.package.debian.loader.DebianLoader.load") + mock_load.return_value = {"status": "eventful"} res = swh_scheduler_celery_app.send_task( "swh.loader.package.debian.tasks.LoadDebian", - kwargs={"url": "some-url", "date": "some-date", "packages": {}}, + kwargs=dict(url="some-url", date="some-date", packages={}), ) assert res res.wait() assert res.successful() - + assert mock_load.called assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -12,6 +12,8 @@ import attr import requests +from swh.core.config import load_from_envvar +from swh.loader.core.loader import DEFAULT_CONFIG from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, @@ -30,6 +32,7 @@ TimestampWithTimezone, ) from swh.storage.algos.snapshot import snapshot_get_all_branches +from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) @@ -107,19 +110,41 @@ visit_type = "deposit" - def __init__(self, url: str, deposit_id: str): + def __init__( + self, + storage: StorageInterface, + url: str, + deposit_id: str, + deposit_client: "ApiClient", + max_content_size: Optional[int] = None, + ): """Constructor Args: url: Origin url to associate the artifacts/metadata to 
deposit_id: Deposit identity + deposit_client: Deposit api client """ - super().__init__(url=url) + super().__init__(storage=storage, url=url, max_content_size=max_content_size) - config_deposit = self.config["deposit"] self.deposit_id = deposit_id - self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"]) + self.client = deposit_client + + @classmethod + def from_configfile(cls, **kwargs: Any): + """Instantiate a loader from the configuration loaded from the + SWH_CONFIG_FILENAME envvar, with potential extra keyword arguments if their + value is not None. + + Args: + kwargs: kwargs passed to the loader instantiation + + """ + config = dict(load_from_envvar(DEFAULT_CONFIG)) + config.update({k: v for k, v in kwargs.items() if v is not None}) + deposit_client = ApiClient(**config.pop("deposit")) + return cls.from_config(deposit_client=deposit_client, **config) def get_versions(self) -> Sequence[str]: # only 1 branch 'HEAD' with no alias since we only have 1 snapshot diff --git a/swh/loader/package/deposit/tasks.py b/swh/loader/package/deposit/tasks.py --- a/swh/loader/package/deposit/tasks.py +++ b/swh/loader/package/deposit/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,4 +11,4 @@ @shared_task(name=__name__ + ".LoadDeposit") def load_deposit(*, url, deposit_id): """Load Deposit artifacts""" - return DepositLoader(url, deposit_id).load() + return DepositLoader.from_configfile(url=url, deposit_id=deposit_id).load() diff --git a/swh/loader/package/deposit/tests/conftest.py b/swh/loader/package/deposit/tests/conftest.py --- a/swh/loader/package/deposit/tests/conftest.py +++ b/swh/loader/package/deposit/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 
The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,6 +8,8 @@ import pytest +from swh.loader.package.deposit.loader import ApiClient + @pytest.fixture def swh_loader_config(swh_loader_config) -> Dict[str, Any]: @@ -21,3 +23,8 @@ } ) return config + + +@pytest.fixture +def deposit_client(swh_loader_config): + return ApiClient(**swh_loader_config["deposit"]) diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,7 +11,7 @@ import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory -from swh.loader.package.deposit.loader import DepositLoader +from swh.loader.package.deposit.loader import ApiClient, DepositLoader from swh.loader.package.loader import now from swh.loader.package.tests.common import check_metadata_paths from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats @@ -41,17 +41,30 @@ return requests_mock_datadir -def test_deposit_init_ok(swh_config, swh_loader_config): +def test_deposit_init_ok(swh_storage, deposit_client, swh_loader_config): url = "some-url" deposit_id = 999 - loader = DepositLoader(url, deposit_id) # Something that does not exist + loader = DepositLoader( + swh_storage, url, deposit_id, deposit_client + ) # Something that does not exist assert loader.url == url assert 
loader.client is not None assert loader.client.base_url == swh_loader_config["deposit"]["url"] -def test_deposit_loading_unknown_deposit(swh_config, requests_mock_datadir): +def test_deposit_from_configfile(swh_config): + """Ensure the deposit instantiation is ok + + """ + loader = DepositLoader.from_configfile(url="some-url", deposit_id="666") + + assert isinstance(loader.client, ApiClient) + + +def test_deposit_loading_unknown_deposit( + swh_storage, deposit_client, requests_mock_datadir +): """Loading an unknown deposit should fail no origin, no visit, no snapshot @@ -59,7 +72,9 @@ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url" unknown_deposit_id = 667 - loader = DepositLoader(url, unknown_deposit_id) # does not exist + loader = DepositLoader( + swh_storage, url, unknown_deposit_id, deposit_client + ) # does not exist actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} @@ -84,7 +99,7 @@ def test_deposit_loading_failure_to_retrieve_1_artifact( - swh_config, requests_mock_datadir_missing_one + swh_storage, deposit_client, requests_mock_datadir_missing_one ): """Deposit with missing artifact ends up with an uneventful/partial visit @@ -92,7 +107,7 @@ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url-2" deposit_id = 666 - loader = DepositLoader(url, deposit_id) + loader = DepositLoader(swh_storage, url, deposit_id, deposit_client) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" @@ -113,10 +128,12 @@ } == stats -def test_revision_metadata_structure(swh_config, requests_mock_datadir): +def test_deposit_revision_metadata_structure( + swh_storage, deposit_client, requests_mock_datadir +): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 - loader = DepositLoader(url, deposit_id) + loader = DepositLoader(swh_storage, url, deposit_id, deposit_client) actual_load_status = loader.load() assert 
actual_load_status["status"] == "eventful" @@ -145,10 +162,10 @@ ) -def test_deposit_loading_ok(swh_config, requests_mock_datadir): +def test_deposit_loading_ok(swh_storage, deposit_client, requests_mock_datadir): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 - loader = DepositLoader(url, deposit_id) + loader = DepositLoader(swh_storage, url, deposit_id, deposit_client) actual_load_status = loader.load() expected_snapshot_id = "b2b327b33dc85818bd23c3ccda8b7e675a66ecbd" @@ -244,14 +261,14 @@ assert body == expected_body -def test_deposit_loading_ok_2(swh_config, requests_mock_datadir): +def test_deposit_loading_ok_2(swh_storage, deposit_client, requests_mock_datadir): """Field dates should be se appropriately """ external_id = "some-external-id" url = f"https://hal-test.archives-ouvertes.fr/{external_id}" deposit_id = 777 - loader = DepositLoader(url, deposit_id) + loader = DepositLoader(swh_storage, url, deposit_id, deposit_client) actual_load_status = loader.load() expected_snapshot_id = "3e68440fdd7c81d283f8f3aebb6f0c8657864192" diff --git a/swh/loader/package/deposit/tests/test_tasks.py b/swh/loader/package/deposit/tests/test_tasks.py --- a/swh/loader/package/deposit/tests/test_tasks.py +++ b/swh/loader/package/deposit/tests/test_tasks.py @@ -1,21 +1,24 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -def test_deposit_loader( +def test_tasks_deposit_loader( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config ): - mock_loader = mocker.patch("swh.loader.package.deposit.loader.DepositLoader.load") - mock_loader.return_value = {"status": "eventful"} + mock_loader = mocker.patch( + "swh.loader.package.deposit.loader.DepositLoader.from_configfile" + ) + 
mock_loader.return_value = mock_loader + mock_loader.load.return_value = {"status": "eventful"} res = swh_scheduler_celery_app.send_task( "swh.loader.package.deposit.tasks.LoadDeposit", - kwargs={"url": "some-url", "deposit_id": "some-d-id",}, + kwargs=dict(url="some-url", deposit_id="some-d-id",), ) assert res res.wait() assert res.successful() - + assert mock_loader.called assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -27,8 +27,8 @@ import attr import sentry_sdk -from swh.core.config import load_from_envvar from swh.core.tarball import uncompress +from swh.loader.core.loader import Loader from swh.loader.exception import NotFound from swh.loader.package.utils import download from swh.model import from_disk @@ -49,7 +49,6 @@ Snapshot, TargetType, ) -from swh.storage import get_storage from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import StorageInterface from swh.storage.utils import now @@ -118,42 +117,27 @@ TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) -DEFAULT_CONFIG = { - "max_content_size": 100 * 1024 * 1024, - "create_authorities": True, - "create_fetchers": True, -} - - -class PackageLoader(Generic[TPackageInfo]): +class PackageLoader(Loader, Generic[TPackageInfo]): # Origin visit type (str) set by the loader visit_type = "" - def __init__(self, url): + def __init__( + self, + storage: StorageInterface, + url: str, + max_content_size: Optional[int] = None, + ): """Loader's constructor. This raises exception if the minimal required configuration is missing (cf. fn:`check` method). 
Args: - url (str): Origin url to load data from + storage: Storage instance + url: Origin url to load data from """ - # This expects to use the environment variable SWH_CONFIG_FILENAME - self.config = load_from_envvar(DEFAULT_CONFIG) - self._check_configuration() - self.storage: StorageInterface = get_storage(**self.config["storage"]) + super().__init__(storage=storage, max_content_size=max_content_size) self.url = url self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) - self.max_content_size = self.config["max_content_size"] - - def _check_configuration(self): - """Checks the minimal configuration required is set for the loader. - - If some required configuration is missing, exception detailing the - issue is raised. - - """ - if "storage" not in self.config: - raise ValueError("Misconfiguration, at least the storage key should be set") def get_versions(self) -> Sequence[str]: """Return the list of all published package versions. diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -28,6 +28,7 @@ Snapshot, TargetType, ) +from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) @@ -58,9 +59,15 @@ visit_type = "nixguix" - def __init__(self, url): - super().__init__(url=url) + def __init__( + self, + storage: StorageInterface, + url: str, + unsupported_file_extensions: List[str] = [], + ): + super().__init__(storage=storage, url=url) self.provider_url = url + self.unsupported_file_extensions = unsupported_file_extensions # Note: this could be renamed get_artifacts in the PackageLoader # base class. 
@@ -71,8 +78,9 @@ @cached_method def supported_sources(self): raw_sources = self.raw_sources() - unsupported_file_extensions = self.config.get("unsupported_file_extensions", []) - return clean_sources(parse_sources(raw_sources), unsupported_file_extensions) + return clean_sources( + parse_sources(raw_sources), self.unsupported_file_extensions + ) @cached_method def integrity_by_url(self) -> Dict[str, Any]: diff --git a/swh/loader/package/nixguix/tasks.py b/swh/loader/package/nixguix/tasks.py --- a/swh/loader/package/nixguix/tasks.py +++ b/swh/loader/package/nixguix/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,4 +11,4 @@ @shared_task(name=__name__ + ".LoadNixguix") def load_nixguix(*, url=None): """Load functional (e.g. guix/nix) package""" - return NixGuixLoader(url).load() + return NixGuixLoader.from_configfile(url=url).load() diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py --- a/swh/loader/package/nixguix/tests/test_nixguix.py +++ b/swh/loader/package/nixguix/tests/test_nixguix.py @@ -100,13 +100,13 @@ assert "integrity" in raw -def test_retrieve_sources(swh_config, requests_mock_datadir): +def test_retrieve_sources(swh_storage, requests_mock_datadir): j = parse_sources(retrieve_sources(sources_url)) assert "sources" in j.keys() assert len(j["sources"]) == 2 -def test_nixguix_url_not_found(swh_config, requests_mock_datadir): +def test_nixguix_url_not_found(swh_storage, requests_mock_datadir): """When failing to read from the url, the visit is marked as not_found. Here the sources url does not exist, so requests_mock_datadir returns a 404. 
@@ -117,21 +117,21 @@ """ unknown_url = "https://non-existing-url/" - loader = NixGuixLoader(unknown_url) + loader = NixGuixLoader(swh_storage, unknown_url) # during the retrieval step load_status = loader.load() assert load_status == {"status": "failed"} assert_last_visit_matches( - loader.storage, unknown_url, status="not_found", type="nixguix", snapshot=None + swh_storage, unknown_url, status="not_found", type="nixguix", snapshot=None ) assert len(requests_mock_datadir.request_history) == 1 assert requests_mock_datadir.request_history[0].url == unknown_url -def test_nixguix_url_with_decoding_error(swh_config, requests_mock_datadir): +def test_nixguix_url_with_decoding_error(swh_storage, requests_mock_datadir): """Other errors during communication with the url, the visit is marked as failed requests_mock_datadir will intercept the requests to sources_url. Since the file @@ -140,26 +140,26 @@ """ sources_url = "https://example.com/file.txt" - loader = NixGuixLoader(sources_url) + loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() assert load_status == {"status": "failed"} assert_last_visit_matches( - loader.storage, sources_url, status="failed", type="nixguix", snapshot=None + swh_storage, sources_url, status="failed", type="nixguix", snapshot=None ) assert len(requests_mock_datadir.request_history) == 1 assert requests_mock_datadir.request_history[0].url == sources_url -def test_clean_sources_invalid_schema(swh_config, requests_mock_datadir): +def test_clean_sources_invalid_schema(swh_storage, requests_mock_datadir): sources = {} with pytest.raises(ValueError, match="sources structure invalid, missing: .*"): clean_sources(sources) -def test_clean_sources_invalid_version(swh_config, requests_mock_datadir): +def test_clean_sources_invalid_version(swh_storage, requests_mock_datadir): for version_ok in [1, "1"]: # Check those versions are fine clean_sources({"version": version_ok, "sources": [], "revision": "my-revision"}) @@ -172,7 +172,7 
@@ ) -def test_clean_sources_invalid_sources(swh_config, requests_mock_datadir): +def test_clean_sources_invalid_sources(swh_storage, requests_mock_datadir): valid_sources = [ # 1 valid source {"type": "url", "urls": ["my-url.tar.gz"], "integrity": "my-integrity"}, @@ -218,7 +218,7 @@ assert actual_match -def test_clean_sources_unsupported_artifacts(swh_config, requests_mock_datadir): +def test_clean_sources_unsupported_artifacts(swh_storage, requests_mock_datadir): unsupported_file_extensions = [ "iso", "whl", @@ -278,12 +278,12 @@ assert len(clean["sources"]) == len(supported_sources) -def test_loader_one_visit(swh_config, requests_mock_datadir, raw_sources): - loader = NixGuixLoader(sources_url) +def test_loader_one_visit(swh_storage, requests_mock_datadir, raw_sources): + loader = NixGuixLoader(swh_storage, sources_url) res = loader.load() assert res["status"] == "eventful" - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 1, "directory": 3, @@ -298,10 +298,10 @@ # The visit is partial because urls pointing to non tarball file # are not handled yet assert_last_visit_matches( - loader.storage, sources_url, status="partial", type="nixguix" + swh_storage, sources_url, status="partial", type="nixguix" ) - visit_status = origin_get_latest_visit_status(loader.storage, sources_url) + visit_status = origin_get_latest_visit_status(swh_storage, sources_url) snapshot_swhid = SWHID( object_type="snapshot", object_id=hash_to_hex(visit_status.snapshot) ) @@ -323,12 +323,12 @@ origin=sources_url, ) ] - assert loader.storage.raw_extrinsic_metadata_get( + assert swh_storage.raw_extrinsic_metadata_get( MetadataTargetType.SNAPSHOT, snapshot_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) -def test_uncompress_failure(swh_config, requests_mock_datadir): +def test_uncompress_failure(swh_storage, requests_mock_datadir): """Non tarball files are currently not supported and the uncompress function 
fails on such kind of files. @@ -337,7 +337,7 @@ created (with a status partial since all files are not archived). """ - loader = NixGuixLoader(sources_url) + loader = NixGuixLoader(swh_storage, sources_url) loader_status = loader.load() sources = loader.supported_sources()["sources"] @@ -348,30 +348,30 @@ # The visit is partial because urls pointing to non tarball files # are not handled yet assert_last_visit_matches( - loader.storage, sources_url, status="partial", type="nixguix" + swh_storage, sources_url, status="partial", type="nixguix" ) -def test_loader_incremental(swh_config, requests_mock_datadir): +def test_loader_incremental(swh_storage, requests_mock_datadir): """Ensure a second visit do not download artifact already downloaded by the previous visit. """ - loader = NixGuixLoader(sources_url) + loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() loader.load() assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} assert_last_visit_matches( - loader.storage, + swh_storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) - check_snapshot(SNAPSHOT1, storage=loader.storage) + check_snapshot(SNAPSHOT1, storage=swh_storage) urls = [ m.url @@ -384,7 +384,7 @@ assert len(urls) == 1 -def test_loader_two_visits(swh_config, requests_mock_datadir_visits): +def test_loader_two_visits(swh_storage, requests_mock_datadir_visits): """To ensure there is only one origin, but two visits, two revisions and two snapshots are created. @@ -393,21 +393,21 @@ another tarball. 
""" - loader = NixGuixLoader(sources_url) + loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} assert_last_visit_matches( - loader.storage, + swh_storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) - check_snapshot(SNAPSHOT1, storage=loader.storage) + check_snapshot(SNAPSHOT1, storage=swh_storage) - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 1, "directory": 3, @@ -419,7 +419,7 @@ "snapshot": 1, } == stats - loader = NixGuixLoader(sources_url) + loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() expected_snapshot_id_hex = "b0bfa75cbd0cc90aac3b9e95fb0f59c731176d97" expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex) @@ -429,7 +429,7 @@ } assert_last_visit_matches( - loader.storage, + swh_storage, sources_url, status="partial", type="nixguix", @@ -456,9 +456,9 @@ ), }, ) - check_snapshot(expected_snapshot, storage=loader.storage) + check_snapshot(expected_snapshot, storage=swh_storage) - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 2, "directory": 5, @@ -471,8 +471,8 @@ } == stats -def test_resolve_revision_from(swh_config, requests_mock_datadir, datadir): - loader = NixGuixLoader(sources_url) +def test_resolve_revision_from(swh_storage, requests_mock_datadir, datadir): + loader = NixGuixLoader(swh_storage, sources_url) known_artifacts = { "id1": {"extrinsic": {"raw": {"url": "url1", "integrity": "integrity1"}}}, @@ -489,23 +489,23 @@ assert loader.resolve_revision_from(known_artifacts, p_info) == None # noqa -def test_evaluation_branch(swh_config, requests_mock_datadir): - loader = NixGuixLoader(sources_url) +def test_evaluation_branch(swh_storage, requests_mock_datadir): + loader = NixGuixLoader(swh_storage, sources_url) res = loader.load() assert res["status"] == "eventful" assert_last_visit_matches( - 
loader.storage, + swh_storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) - check_snapshot(SNAPSHOT1, storage=loader.storage) + check_snapshot(SNAPSHOT1, storage=swh_storage) -def test_eoferror(swh_config, requests_mock_datadir): +def test_eoferror(swh_storage, requests_mock_datadir): """Load a truncated archive which is invalid to make the uncompress function raising the exception EOFError. We then check if a snapshot is created, meaning this error is well managed. @@ -514,7 +514,7 @@ sources = ( "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json" # noqa ) - loader = NixGuixLoader(sources) + loader = NixGuixLoader(swh_storage, sources) loader.load() expected_snapshot = Snapshot( @@ -527,7 +527,7 @@ }, ) - check_snapshot(expected_snapshot, storage=loader.storage) + check_snapshot(expected_snapshot, storage=swh_storage) def fake_download( @@ -550,11 +550,11 @@ return download(url, dest, hashes, filename, auth) -def test_raise_exception(swh_config, requests_mock_datadir, mocker): +def test_raise_exception(swh_storage, requests_mock_datadir, mocker): mock_download = mocker.patch("swh.loader.package.loader.download") mock_download.side_effect = fake_download - loader = NixGuixLoader(sources_url) + loader = NixGuixLoader(swh_storage, sources_url) res = loader.load() assert res == { @@ -562,18 +562,18 @@ "snapshot_id": SNAPSHOT1.id.hex(), } - check_snapshot(SNAPSHOT1, storage=loader.storage) + check_snapshot(SNAPSHOT1, storage=swh_storage) assert len(mock_download.mock_calls) == 2 # The visit is partial because some artifact downloads failed assert_last_visit_matches( - loader.storage, sources_url, status="partial", type="nixguix" + swh_storage, sources_url, status="partial", type="nixguix" ) def test_load_nixguix_one_common_artifact_from_other_loader( - swh_config, datadir, requests_mock_datadir_visits, caplog + swh_storage, datadir, requests_mock_datadir_visits, caplog ): """Misformatted revision should be caught and logged, 
then loading continues @@ -593,7 +593,7 @@ "version": release, } ] - archive_loader = ArchiveLoader(url=gnu_url, artifacts=gnu_artifacts) + archive_loader = ArchiveLoader(swh_storage, url=gnu_url, artifacts=gnu_artifacts) actual_load_status = archive_loader.load() expected_snapshot_id = "c419397fd912039825ebdbea378bc6283f006bf5" assert actual_load_status["status"] == "eventful" @@ -627,16 +627,14 @@ # first visit with a snapshot, ok sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json" - loader = NixGuixLoader(sources_url) + loader = NixGuixLoader(swh_storage, sources_url) actual_load_status2 = loader.load() assert actual_load_status2["status"] == "eventful" - assert_last_visit_matches( - loader.storage, sources_url, status="full", type="nixguix" - ) + assert_last_visit_matches(swh_storage, sources_url, status="full", type="nixguix") snapshot_id = actual_load_status2["snapshot_id"] - snapshot = snapshot_get_all_branches(loader.storage, hash_to_bytes(snapshot_id)) + snapshot = snapshot_get_all_branches(swh_storage, hash_to_bytes(snapshot_id)) assert snapshot # simulate a snapshot already seen with a revision with the wrong metadata structure @@ -646,7 +644,7 @@ ) as last_snapshot: # mutate the snapshot to target a revision with the wrong metadata structure # snapshot["branches"][artifact_url.encode("utf-8")] = first_revision - old_revision = loader.storage.revision_get([first_revision.target])[0] + old_revision = swh_storage.revision_get([first_revision.target])[0] # assert that revision is not in the right format assert old_revision.metadata["extrinsic"]["raw"].get("integrity", {}) == {} @@ -666,25 +664,25 @@ # a revision written by somebody else (structure different) last_snapshot.return_value = snapshot - loader = NixGuixLoader(sources_url) + loader = NixGuixLoader(swh_storage, sources_url) actual_load_status3 = loader.load() assert last_snapshot.called assert actual_load_status3["status"] == "eventful" assert_last_visit_matches( - 
loader.storage, sources_url, status="full", type="nixguix" + swh_storage, sources_url, status="full", type="nixguix" ) new_snapshot_id = "32ff641e510aceefc3a6d0dcbf208b2854d2e965" assert actual_load_status3["snapshot_id"] == new_snapshot_id last_snapshot = snapshot_get_all_branches( - loader.storage, hash_to_bytes(new_snapshot_id) + swh_storage, hash_to_bytes(new_snapshot_id) ) new_revision_branch = last_snapshot.branches[artifact_url.encode("utf-8")] assert new_revision_branch.target_type == TargetType.REVISION - new_revision = loader.storage.revision_get([new_revision_branch.target])[0] + new_revision = swh_storage.revision_get([new_revision_branch.target])[0] # the new revision has the correct structure, so it got ingested alright by the # new run diff --git a/swh/loader/package/nixguix/tests/test_tasks.py b/swh/loader/package/nixguix/tests/test_tasks.py --- a/swh/loader/package/nixguix/tests/test_tasks.py +++ b/swh/loader/package/nixguix/tests/test_tasks.py @@ -1,23 +1,17 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json - -def test_nixguix_loader( +def test_tasks_nixguix_loader( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config ): - mock_loader = mocker.patch("swh.loader.package.nixguix.loader.NixGuixLoader.load") - mock_loader.return_value = {"status": "eventful"} - - mock_retrieve_sources = mocker.patch( - "swh.loader.package.nixguix.loader.retrieve_sources" + mock_loader = mocker.patch( + "swh.loader.package.nixguix.loader.NixGuixLoader.from_configfile" ) - mock_retrieve_sources.return_value = json.dumps( - {"version": 1, "sources": [], "revision": "some-revision",} - ).encode() + mock_loader.return_value = mock_loader + mock_loader.load.return_value = {"status": 
"eventful"} res = swh_scheduler_celery_app.send_task( "swh.loader.package.nixguix.tasks.LoadNixguix", kwargs=dict(url="some-url") @@ -25,5 +19,5 @@ assert res res.wait() assert res.successful() - + assert mock_loader.called assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -28,6 +28,7 @@ Sha1Git, TimestampWithTimezone, ) +from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) @@ -86,13 +87,18 @@ visit_type = "npm" - def __init__(self, url: str): + def __init__( + self, + storage: StorageInterface, + url: str, + max_content_size: Optional[int] = None, + ): """Constructor Args str: origin url (e.g. https://www.npmjs.com/package/) """ - super().__init__(url=url) + super().__init__(storage=storage, url=url, max_content_size=max_content_size) package_name = url.split("https://www.npmjs.com/package/")[1] safe_name = quote(package_name, safe="") self.provider_url = f"https://replicate.npmjs.com/{safe_name}/" diff --git a/swh/loader/package/npm/tasks.py b/swh/loader/package/npm/tasks.py --- a/swh/loader/package/npm/tasks.py +++ b/swh/loader/package/npm/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,4 +11,4 @@ @shared_task(name=__name__ + ".LoadNpm") def load_npm(*, url: str): """Load Npm package""" - return NpmLoader(url).load() + return NpmLoader.from_configfile(url=url).load() diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -66,7 +66,7 @@ assert 
_author_str(author) == expected_author -def test_extract_npm_package_author(datadir): +def test_npm_extract_npm_package_author(datadir): package_metadata_filepath = os.path.join( datadir, "https_replicate.npmjs.com", "org_visit1" ) @@ -305,16 +305,16 @@ return "https://replicate.npmjs.com/%s/" % package -def test_revision_metadata_structure(swh_config, requests_mock_datadir): +def test_npm_revision_metadata_structure(swh_storage, requests_mock_datadir): package = "org" - loader = NpmLoader(package_url(package)) + loader = NpmLoader(swh_storage, package_url(package)) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None expected_revision_id = hash_to_bytes("d8a1c7474d2956ac598a19f0f27d52f7015f117e") - revision = loader.storage.revision_get([expected_revision_id])[0] + revision = swh_storage.revision_get([expected_revision_id])[0] assert revision is not None check_metadata_paths( @@ -336,10 +336,10 @@ ) -def test_npm_loader_first_visit(swh_config, requests_mock_datadir, org_api_info): +def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info): package = "org" url = package_url(package) - loader = NpmLoader(url) + loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("d0587e1195aed5a8800411a008f2f2d627f18e2d") @@ -349,10 +349,10 @@ } assert_last_visit_matches( - loader.storage, url, status="full", type="npm", snapshot=expected_snapshot_id + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), @@ -365,18 +365,15 @@ "snapshot": 1, } == stats - contents = loader.storage.content_get(_expected_new_contents_first_visit) + contents = swh_storage.content_get(_expected_new_contents_first_visit) count = sum(0 if content is None else 1 for content in contents) assert 
count == len(_expected_new_contents_first_visit) assert ( - list(loader.storage.directory_missing(_expected_new_directories_first_visit)) - == [] + list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == [] ) - assert ( - list(loader.storage.revision_missing(_expected_new_revisions_first_visit)) == [] - ) + assert list(swh_storage.revision_missing(_expected_new_revisions_first_visit)) == [] versions = [ ("0.0.2", "d8a1c7474d2956ac598a19f0f27d52f7015f117e"), @@ -399,14 +396,14 @@ }, }, ) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", ) for (version_name, revision_id) in versions: - revision = loader.storage.revision_get([hash_to_bytes(revision_id)])[0] + revision = swh_storage.revision_get([hash_to_bytes(revision_id)])[0] directory_id = revision.directory directory_swhid = SWHID(object_type="directory", object_id=directory_id,) revision_swhid = SWHID(object_type="revision", object_id=revision_id,) @@ -427,15 +424,15 @@ revision=revision_swhid, ) ] - assert loader.storage.raw_extrinsic_metadata_get( + assert swh_storage.raw_extrinsic_metadata_get( MetadataTargetType.DIRECTORY, directory_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) -def test_npm_loader_incremental_visit(swh_config, requests_mock_datadir_visits): +def test_npm_loader_incremental_visit(swh_storage, requests_mock_datadir_visits): package = "org" url = package_url(package) - loader = NpmLoader(url) + loader = NpmLoader(swh_storage, url) expected_snapshot_id = hash_to_bytes("d0587e1195aed5a8800411a008f2f2d627f18e2d") actual_load_status = loader.load() @@ -444,10 +441,10 @@ "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( - loader.storage, url, status="full", type="npm", snapshot=expected_snapshot_id + swh_storage, url, status="full", type="npm", 
snapshot=expected_snapshot_id ) - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), @@ -470,9 +467,9 @@ assert snap_id2 is not None assert snap_id2 != actual_load_status["snapshot_id"] - assert_last_visit_matches(loader.storage, url, status="full", type="npm") + assert_last_visit_matches(swh_storage, url, status="full", type="npm") - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { # 3 new releases artifacts "content": len(_expected_new_contents_first_visit) + 14, @@ -494,10 +491,10 @@ @pytest.mark.usefixtures("requests_mock_datadir") -def test_npm_loader_version_divergence(swh_config): +def test_npm_loader_version_divergence(swh_storage): package = "@aller_shared" url = package_url(package) - loader = NpmLoader(url) + loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92") @@ -506,10 +503,10 @@ "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( - loader.storage, url, status="full", type="npm", snapshot=expected_snapshot_id + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { # 1 new releases artifacts "content": 534, @@ -538,7 +535,7 @@ ), }, ) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) def test_npm_artifact_to_revision_id_none(): @@ -602,13 +599,13 @@ ) -def test_npm_artifact_with_no_intrinsic_metadata(swh_config, requests_mock_datadir): +def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion """ package = "nativescript-telerik-analytics" url = package_url(package) - loader = NpmLoader(url) + loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() # no branch as one artifact 
without any intrinsic metadata @@ -620,20 +617,20 @@ "snapshot_id": expected_snapshot.id.hex(), } - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( - loader.storage, url, status="full", type="npm", snapshot=expected_snapshot.id + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id ) -def test_npm_artifact_with_no_upload_time(swh_config, requests_mock_datadir): +def test_npm_artifact_with_no_upload_time(swh_storage, requests_mock_datadir): """With no time upload, artifact is skipped """ package = "jammit-no-time" url = package_url(package) - loader = NpmLoader(url) + loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() # no branch as one artifact without any intrinsic metadata @@ -645,20 +642,20 @@ "snapshot_id": expected_snapshot.id.hex(), } - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( - loader.storage, url, status="partial", type="npm", snapshot=expected_snapshot.id + swh_storage, url, status="partial", type="npm", snapshot=expected_snapshot.id ) -def test_npm_artifact_use_mtime_if_no_time(swh_config, requests_mock_datadir): +def test_npm_artifact_use_mtime_if_no_time(swh_storage, requests_mock_datadir): """With no time upload, artifact is skipped """ package = "jammit-express" url = package_url(package) - loader = NpmLoader(url) + loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("d6e08e19159f77983242877c373c75222d5ae9dd") @@ -681,34 +678,34 @@ ), }, ) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( - loader.storage, url, status="full", type="npm", snapshot=expected_snapshot.id + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id ) -def test_npm_no_artifact(swh_config, requests_mock_datadir): +def 
test_npm_no_artifact(swh_storage, requests_mock_datadir): """If no artifacts at all is found for origin, the visit fails completely """ package = "catify" url = package_url(package) - loader = NpmLoader(url) + loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == { "status": "failed", } - assert_last_visit_matches(loader.storage, url, status="failed", type="npm") + assert_last_visit_matches(swh_storage, url, status="failed", type="npm") -def test_npm_origin_not_found(swh_config, requests_mock_datadir): +def test_npm_origin_not_found(swh_storage, requests_mock_datadir): url = package_url("non-existent-url") - loader = NpmLoader(url) + loader = NpmLoader(swh_storage, url) assert loader.load() == {"status": "failed"} assert_last_visit_matches( - loader.storage, url, status="not_found", type="npm", snapshot=None + swh_storage, url, status="not_found", type="npm", snapshot=None ) diff --git a/swh/loader/package/npm/tests/test_tasks.py b/swh/loader/package/npm/tests/test_tasks.py --- a/swh/loader/package/npm/tests/test_tasks.py +++ b/swh/loader/package/npm/tests/test_tasks.py @@ -1,14 +1,14 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -def test_npm_loader( +def test_tasks_npm_loader( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config ): - mock_loader = mocker.patch("swh.loader.package.npm.loader.NpmLoader.load") - mock_loader.return_value = {"status": "eventful"} + mock_load = mocker.patch("swh.loader.package.npm.loader.NpmLoader.load") + mock_load.return_value = {"status": "eventful"} res = swh_scheduler_celery_app.send_task( "swh.loader.package.npm.tasks.LoadNpm", @@ -17,5 +17,5 @@ assert res res.wait() assert res.successful() - + assert 
mock_load.called assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -27,6 +27,7 @@ Sha1Git, TimestampWithTimezone, ) +from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) @@ -63,8 +64,13 @@ visit_type = "pypi" - def __init__(self, url): - super().__init__(url=url) + def __init__( + self, + storage: StorageInterface, + url: str, + max_content_size: Optional[int] = None, + ): + super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.provider_url = pypi_api_url(self.url) @cached_method diff --git a/swh/loader/package/pypi/tasks.py b/swh/loader/package/pypi/tasks.py --- a/swh/loader/package/pypi/tasks.py +++ b/swh/loader/package/pypi/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,4 +11,4 @@ @shared_task(name=__name__ + ".LoadPyPI") def load_pypi(*, url=None): """Load PyPI package""" - return PyPILoader(url).load() + return PyPILoader.from_configfile(url=url).load() diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -3,14 +3,12 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import copy import json import os from os import path from unittest.mock import patch import pytest -import yaml from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.core.tarball import uncompress @@ -48,7 +46,7 @@ return f.read() -def 
test_author_basic(): +def test_pypi_author_basic(): data = { "author": "i-am-groot", "author_email": "iam@groot.org", @@ -64,7 +62,7 @@ assert actual_author == expected_author -def test_author_empty_email(): +def test_pypi_author_empty_email(): data = { "author": "i-am-groot", "author_email": "", @@ -76,7 +74,7 @@ assert actual_author == expected_author -def test_author_empty_name(): +def test_pypi_author_empty_name(): data = { "author": "", "author_email": "iam@groot.org", @@ -90,7 +88,7 @@ assert actual_author == expected_author -def test_author_malformed(): +def test_pypi_author_malformed(): data = { "author": "['pierre', 'paul', 'jacques']", "author_email": None, @@ -107,7 +105,7 @@ assert actual_author == expected_author -def test_author_malformed_2(): +def test_pypi_author_malformed_2(): data = { "author": "[marie, jeanne]", "author_email": "[marie@some, jeanne@thing]", @@ -124,7 +122,7 @@ assert actual_author == expected_author -def test_author_malformed_3(): +def test_pypi_author_malformed_3(): data = { "author": "[marie, jeanne, pierre]", "author_email": "[marie@somewhere.org, jeanne@somewhere.org]", @@ -146,20 +144,6 @@ # configuration error # -def test_badly_configured_loader_raise(tmp_path, swh_loader_config, monkeypatch): - """Badly configured loader should raise""" - wrong_config = copy.deepcopy(swh_loader_config) - wrong_config.pop("storage") - - conf_path = os.path.join(str(tmp_path), "loader.yml") - with open(conf_path, "w") as f: - f.write(yaml.dump(wrong_config)) - monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path) - - with pytest.raises(ValueError, match="Misconfiguration"): - PyPILoader(url="some-url") - - def test_pypi_api_url(): """Compute pypi api url from the pypi project url should be ok""" url = pypi_api_url("https://pypi.org/project/requests") @@ -173,7 +157,7 @@ @pytest.mark.fs -def test_extract_intrinsic_metadata(tmp_path, datadir): +def test_pypi_extract_intrinsic_metadata(tmp_path, datadir): """Parsing existing archive's PKG-INFO 
should yield results""" uncompressed_archive_path = str(tmp_path) archive_path = path.join( @@ -197,7 +181,7 @@ @pytest.mark.fs -def test_extract_intrinsic_metadata_failures(tmp_path): +def test_pypi_extract_intrinsic_metadata_failures(tmp_path): """Parsing inexistent path/archive/PKG-INFO yield None""" tmp_path = str(tmp_path) # py3.5 work around (PosixPath issue) # inexistent first level path @@ -225,18 +209,18 @@ ) -def test_no_release_artifact(swh_config, requests_mock_datadir_missing_all): +def test_pypi_no_release_artifact(swh_storage, requests_mock_datadir_missing_all): """Load a pypi project with all artifacts missing ends up with no snapshot """ url = "https://pypi.org/project/0805nexter" - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 0, "directory": 0, @@ -248,10 +232,10 @@ "snapshot": 1, } == stats - assert_last_visit_matches(loader.storage, url, status="partial", type="pypi") + assert_last_visit_matches(swh_storage, url, status="partial", type="pypi") -def test_pypi_fail__load_snapshot(swh_config, requests_mock_datadir): +def test_pypi_fail__load_snapshot(swh_storage, requests_mock_datadir): """problem during loading: {visit: failed, status: failed, no snapshot} """ @@ -260,7 +244,7 @@ "swh.loader.package.pypi.loader.PyPILoader._load_snapshot", side_effect=ValueError("Fake problem to fail visit"), ): - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} @@ -278,25 +262,25 @@ "snapshot": 0, } == stats - assert_last_visit_matches(loader.storage, url, status="failed", type="pypi") + assert_last_visit_matches(swh_storage, url, status="failed", type="pypi") # problem during loading: # {visit: partial, status: 
uneventful, no snapshot} -def test_release_with_traceback(swh_config, requests_mock_datadir): +def test_pypi_release_with_traceback(swh_storage, requests_mock_datadir): url = "https://pypi.org/project/0805nexter" with patch( "swh.loader.package.pypi.loader.PyPILoader.last_snapshot", side_effect=ValueError("Fake problem to fail the visit"), ): - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 0, @@ -309,7 +293,7 @@ "snapshot": 0, } == stats - assert_last_visit_matches(loader.storage, url, status="failed", type="pypi") + assert_last_visit_matches(swh_storage, url, status="failed", type="pypi") # problem during loading: failure early enough in between swh contents... @@ -333,18 +317,18 @@ # {visit partial, status: eventful, 1 snapshot} -def test_revision_metadata_structure( - swh_config, requests_mock_datadir, _0805nexter_api_info +def test_pypi_revision_metadata_structure( + swh_storage, requests_mock_datadir, _0805nexter_api_info ): url = "https://pypi.org/project/0805nexter" - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None expected_revision_id = hash_to_bytes("e445da4da22b31bfebb6ffc4383dbf839a074d21") - revision = loader.storage.revision_get([expected_revision_id])[0] + revision = swh_storage.revision_get([expected_revision_id])[0] assert revision is not None check_metadata_paths( @@ -391,17 +375,19 @@ revision=revision_swhid, ) ] - assert loader.storage.raw_extrinsic_metadata_get( + assert swh_storage.raw_extrinsic_metadata_get( MetadataTargetType.DIRECTORY, directory_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) -def test_visit_with_missing_artifact(swh_config, 
requests_mock_datadir_missing_one): +def test_pypi_visit_with_missing_artifact( + swh_storage, requests_mock_datadir_missing_one +): """Load a pypi project with some missing artifacts ends up with 1 snapshot """ url = "https://pypi.org/project/0805nexter" - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("dd0e4201a232b1c104433741dbf45895b8ac9355") @@ -410,7 +396,7 @@ "snapshot_id": expected_snapshot_id.hex(), } - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 3, @@ -432,7 +418,7 @@ ], ) - assert list(loader.storage.content_missing_per_sha1(expected_contents)) == [] + assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map( hash_to_bytes, @@ -442,7 +428,7 @@ ], ) - assert list(loader.storage.directory_missing(expected_dirs)) == [] + assert list(swh_storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { @@ -450,7 +436,7 @@ "b178b66bd22383d5f16f4f5c923d39ca798861b4" ), # noqa } - assert list(loader.storage.revision_missing(expected_revs)) == [] + assert list(swh_storage.revision_missing(expected_revs)) == [] expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), @@ -464,23 +450,19 @@ ), }, ) - check_snapshot(expected_snapshot, storage=loader.storage) + check_snapshot(expected_snapshot, storage=swh_storage) assert_last_visit_matches( - loader.storage, - url, - status="partial", - type="pypi", - snapshot=expected_snapshot_id, + swh_storage, url, status="partial", type="pypi", snapshot=expected_snapshot_id, ) -def test_visit_with_1_release_artifact(swh_config, requests_mock_datadir): +def test_pypi_visit_with_1_release_artifact(swh_storage, requests_mock_datadir): """With no prior visit, load a pypi project ends up with 1 snapshot """ url = "https://pypi.org/project/0805nexter" - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) 
actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a") @@ -489,7 +471,7 @@ "snapshot_id": expected_snapshot_id.hex(), } - stats = get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 6, "directory": 4, @@ -513,7 +495,7 @@ ], ) - assert list(loader.storage.content_missing_per_sha1(expected_contents)) == [] + assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map( hash_to_bytes, @@ -525,7 +507,7 @@ ], ) - assert list(loader.storage.directory_missing(expected_dirs)) == [] + assert list(swh_storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { @@ -536,7 +518,7 @@ "b178b66bd22383d5f16f4f5c923d39ca798861b4" ), # noqa } - assert list(loader.storage.revision_missing(expected_revs)) == [] + assert list(swh_storage.revision_missing(expected_revs)) == [] expected_snapshot = Snapshot( id=expected_snapshot_id, @@ -554,19 +536,19 @@ ), }, ) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( - loader.storage, url, status="full", type="pypi", snapshot=expected_snapshot_id + swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id ) -def test_multiple_visits_with_no_change(swh_config, requests_mock_datadir): +def test_pypi_multiple_visits_with_no_change(swh_storage, requests_mock_datadir): """Multiple visits with no changes results in 1 same snapshot """ url = "https://pypi.org/project/0805nexter" - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() snapshot_id = hash_to_bytes("ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a") @@ -575,10 +557,10 @@ "snapshot_id": snapshot_id.hex(), } assert_last_visit_matches( - loader.storage, url, status="full", type="pypi", snapshot=snapshot_id + swh_storage, url, status="full", type="pypi", snapshot=snapshot_id ) - stats = 
get_stats(loader.storage) + stats = get_stats(swh_storage) assert { "content": 6, @@ -607,7 +589,7 @@ ), }, ) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) actual_load_status2 = loader.load() assert actual_load_status2 == { @@ -616,10 +598,10 @@ } visit_status2 = assert_last_visit_matches( - loader.storage, url, status="full", type="pypi" + swh_storage, url, status="full", type="pypi" ) - stats2 = get_stats(loader.storage) + stats2 = get_stats(swh_storage) expected_stats2 = stats.copy() expected_stats2["origin_visit"] = 1 + 1 assert expected_stats2 == stats2 @@ -628,15 +610,15 @@ assert visit_status2.snapshot == snapshot_id -def test_incremental_visit(swh_config, requests_mock_datadir_visits): +def test_pypi_incremental_visit(swh_storage, requests_mock_datadir_visits): """With prior visit, 2nd load will result with a different snapshot """ url = "https://pypi.org/project/0805nexter" - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) visit1_actual_load_status = loader.load() - visit1_stats = get_stats(loader.storage) + visit1_stats = get_stats(swh_storage) expected_snapshot_id = hash_to_bytes("ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a") assert visit1_actual_load_status == { "status": "eventful", @@ -644,7 +626,7 @@ } assert_last_visit_matches( - loader.storage, url, status="full", type="pypi", snapshot=expected_snapshot_id + swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id ) assert { @@ -663,7 +645,7 @@ del loader._cached_info visit2_actual_load_status = loader.load() - visit2_stats = get_stats(loader.storage) + visit2_stats = get_stats(swh_storage) assert visit2_actual_load_status["status"] == "eventful", visit2_actual_load_status expected_snapshot_id2 = hash_to_bytes("2e5149a7b0725d18231a37b342e9b7c4e121f283") @@ -673,7 +655,7 @@ } assert_last_visit_matches( - loader.storage, url, status="full", type="pypi", snapshot=expected_snapshot_id2 + swh_storage, url, 
status="full", type="pypi", snapshot=expected_snapshot_id2 ) assert { @@ -700,7 +682,7 @@ ], ) - assert list(loader.storage.content_missing_per_sha1(expected_contents)) == [] + assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map( hash_to_bytes, @@ -714,7 +696,7 @@ ], ) - assert list(loader.storage.directory_missing(expected_dirs)) == [] + assert list(swh_storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { @@ -729,7 +711,7 @@ ), # noqa } - assert list(loader.storage.revision_missing(expected_revs)) == [] + assert list(swh_storage.revision_missing(expected_revs)) == [] expected_snapshot = Snapshot( id=expected_snapshot_id2, @@ -752,10 +734,10 @@ }, ) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( - loader.storage, url, status="full", type="pypi", snapshot=expected_snapshot.id + swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) urls = [ @@ -778,12 +760,12 @@ # snapshot branch output is different -def test_visit_1_release_with_2_artifacts(swh_config, requests_mock_datadir): +def test_pypi_visit_1_release_with_2_artifacts(swh_storage, requests_mock_datadir): """With no prior visit, load a pypi project ends up with 1 snapshot """ url = "https://pypi.org/project/nexter" - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("a27e638a4dad6fbfa273c6ebec1c4bf320fb84c6") @@ -805,10 +787,10 @@ ), }, ) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( - loader.storage, url, status="full", type="pypi", snapshot=expected_snapshot.id + swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) @@ -883,12 +865,12 @@ ) -def test_pypi_artifact_with_no_intrinsic_metadata(swh_config, requests_mock_datadir): +def 
test_pypi_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion """ url = "https://pypi.org/project/upymenu" - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e") @@ -899,19 +881,19 @@ # no branch as one artifact without any intrinsic metadata expected_snapshot = Snapshot(id=expected_snapshot_id, branches={}) - check_snapshot(expected_snapshot, loader.storage) + check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( - loader.storage, url, status="full", type="pypi", snapshot=expected_snapshot.id + swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) -def test_pypi_origin_not_found(swh_config, requests_mock_datadir): +def test_pypi_origin_not_found(swh_storage, requests_mock_datadir): url = "https://pypi.org/project/unknown" - loader = PyPILoader(url) + loader = PyPILoader(swh_storage, url) assert loader.load() == {"status": "failed"} assert_last_visit_matches( - loader.storage, url, status="not_found", type="pypi", snapshot=None + swh_storage, url, status="not_found", type="pypi", snapshot=None ) diff --git a/swh/loader/package/pypi/tests/test_tasks.py b/swh/loader/package/pypi/tests/test_tasks.py --- a/swh/loader/package/pypi/tests/test_tasks.py +++ b/swh/loader/package/pypi/tests/test_tasks.py @@ -1,14 +1,14 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -def test_pypi_loader( +def test_tasks_pypi_loader( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config ): - mock_loader = mocker.patch("swh.loader.package.pypi.loader.PyPILoader.load") 
- mock_loader.return_value = {"status": "eventful"} + mock_load = mocker.patch("swh.loader.package.pypi.loader.PyPILoader.load") + mock_load.return_value = {"status": "eventful"} res = swh_scheduler_celery_app.send_task( "swh.loader.package.pypi.tasks.LoadPyPI", kwargs=dict(url="some-url") @@ -16,5 +16,5 @@ assert res res.wait() assert res.successful() - + assert mock_load.called assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py --- a/swh/loader/package/tests/test_loader.py +++ b/swh/loader/package/tests/test_loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -25,11 +25,11 @@ raise ValueError("We refuse to add an origin visit") -def test_loader_origin_visit_failure(swh_config): +def test_loader_origin_visit_failure(swh_storage): """Failure to add origin or origin visit should failed immediately """ - loader = PackageLoader("some-url") + loader = PackageLoader(swh_storage, "some-url") loader.storage = FakeStorage() actual_load_status = loader.load() @@ -84,4 +84,4 @@ with pytest.raises( AssertionError, match="SWH_CONFIG_FILENAME environment variable is undefined" ): - DummyPackageLoader(url="some-url") + DummyPackageLoader.from_configfile(url="some-url") diff --git a/swh/loader/package/tests/test_loader_metadata.py b/swh/loader/package/tests/test_loader_metadata.py --- a/swh/loader/package/tests/test_loader_metadata.py +++ b/swh/loader/package/tests/test_loader_metadata.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU 
General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -27,7 +27,6 @@ RevisionType, Sha1Git, ) -from swh.storage import get_storage EMPTY_SNAPSHOT_ID = "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e" FULL_SNAPSHOT_ID = "4a9b608c9f01860a627237dd2409d1d50ec4b054" @@ -138,11 +137,8 @@ return [RawExtrinsicMetadataCore(m.format, m.metadata, m.discovery_date)] -def test_load_artifact_metadata(swh_config, caplog): - storage = get_storage("memory") - - loader = MetadataTestLoader(ORIGIN_URL) - loader.storage = storage +def test_load_artifact_metadata(swh_storage, caplog): + loader = MetadataTestLoader(swh_storage, ORIGIN_URL) load_status = loader.load() assert load_status == { @@ -154,7 +150,7 @@ type=MetadataAuthorityType.REGISTRY, url="https://softwareheritage.org/", ) - result = storage.raw_extrinsic_metadata_get( + result = swh_storage.raw_extrinsic_metadata_get( MetadataTargetType.DIRECTORY, DIRECTORY_SWHID, authority, ) assert result.next_page_token is None @@ -172,11 +168,8 @@ ) -def test_load_metadata(swh_config, caplog): - storage = get_storage("memory") - - loader = MetadataTestLoader(ORIGIN_URL) - loader.storage = storage +def test_load_metadata(swh_storage, caplog): + loader = MetadataTestLoader(swh_storage, ORIGIN_URL) load_status = loader.load() assert load_status == { @@ -184,13 +177,13 @@ "snapshot_id": FULL_SNAPSHOT_ID, } - result = storage.raw_extrinsic_metadata_get( + result = swh_storage.raw_extrinsic_metadata_get( MetadataTargetType.DIRECTORY, DIRECTORY_SWHID, AUTHORITY, ) assert result.next_page_token is None assert result.results == DIRECTORY_METADATA - result = storage.raw_extrinsic_metadata_get( + result = swh_storage.raw_extrinsic_metadata_get( MetadataTargetType.ORIGIN, ORIGIN_URL, AUTHORITY, ) assert result.next_page_token is None @@ -199,14 +192,8 @@ assert caplog.text == "" -def test_existing_authority(swh_config, caplog): - storage = get_storage("memory") - - loader = 
MetadataTestLoader(ORIGIN_URL) - loader.storage = storage - loader.config["create_authorities"] = False - - storage.metadata_authority_add([attr.evolve(AUTHORITY, metadata={})]) +def test_existing_authority(swh_storage, caplog): + loader = MetadataTestLoader(swh_storage, ORIGIN_URL) load_status = loader.load() assert load_status == { @@ -214,7 +201,7 @@ "snapshot_id": FULL_SNAPSHOT_ID, } - result = storage.raw_extrinsic_metadata_get( + result = swh_storage.raw_extrinsic_metadata_get( MetadataTargetType.DIRECTORY, DIRECTORY_SWHID, AUTHORITY, ) assert result.next_page_token is None @@ -223,14 +210,8 @@ assert caplog.text == "" -def test_existing_fetcher(swh_config, caplog): - storage = get_storage("memory") - - loader = MetadataTestLoader(ORIGIN_URL) - loader.storage = storage - loader.config["create_fetchers"] = False - - storage.metadata_fetcher_add([attr.evolve(FETCHER, metadata={})]) +def test_existing_fetcher(swh_storage, caplog): + loader = MetadataTestLoader(swh_storage, ORIGIN_URL) load_status = loader.load() assert load_status == { @@ -238,7 +219,7 @@ "snapshot_id": FULL_SNAPSHOT_ID, } - result = storage.raw_extrinsic_metadata_get( + result = swh_storage.raw_extrinsic_metadata_get( MetadataTargetType.DIRECTORY, DIRECTORY_SWHID, AUTHORITY, ) assert result.next_page_token is None diff --git a/swh/loader/pytest_plugin.py b/swh/loader/pytest_plugin.py --- a/swh/loader/pytest_plugin.py +++ b/swh/loader/pytest_plugin.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,30 +11,36 @@ @pytest.fixture -def swh_loader_config(swh_storage_postgresql) -> Dict[str, Any]: +def swh_storage_backend_config(swh_storage_postgresql) -> Dict[str, Any]: return { + "cls": "retry", "storage": { - "cls": 
"pipeline", - "steps": [ - {"cls": "retry"}, - {"cls": "filter"}, - {"cls": "buffer"}, - { + "cls": "filter", + "storage": { + "cls": "buffer", + "storage": { "cls": "local", "db": swh_storage_postgresql.dsn, "objstorage": {"cls": "memory"}, }, - ], + }, }, } @pytest.fixture -def swh_config(swh_loader_config, monkeypatch, tmp_path): +def swh_loader_config(swh_storage_backend_config) -> Dict[str, Any]: + return { + "storage": swh_storage_backend_config, + } + + +@pytest.fixture +def swh_config(swh_loader_config, monkeypatch, tmp_path) -> str: conffile = os.path.join(str(tmp_path), "loader.yml") with open(conffile, "w") as f: f.write(yaml.dump(swh_loader_config)) - monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) + monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) return conffile diff --git a/swh/loader/tests/conftest.py b/swh/loader/tests/conftest.py --- a/swh/loader/tests/conftest.py +++ b/swh/loader/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,7 +11,7 @@ @pytest.fixture def swh_loader_config() -> Dict[str, Any]: return { - "storage": {"cls": "pipeline", "steps": [{"cls": "memory",},],}, + "storage": {"cls": "memory",}, "deposit": { "url": "https://deposit.softwareheritage.org/1/private", "auth": {"username": "user", "password": "pass",}, diff --git a/swh/loader/tests/test_cli.py b/swh/loader/tests/test_cli.py --- a/swh/loader/tests/test_cli.py +++ b/swh/loader/tests/test_cli.py @@ -1,15 +1,18 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See 
top-level LICENSE file for more information import datetime +import os from click.formatting import HelpFormatter from click.testing import CliRunner import pytest +import yaml -from swh.loader.cli import SUPPORTED_LOADERS, get_loader, list, run +from swh.loader.cli import SUPPORTED_LOADERS, get_loader +from swh.loader.cli import loader as loader_cli from swh.loader.package.loader import PackageLoader @@ -23,18 +26,18 @@ get_loader(loader_type, url="db-url") -def test_get_loader(swh_config): +def test_get_loader(swh_loader_config): """Instantiating a supported loader should be ok """ loader_input = { - "archive": {"url": "some-url", "artifacts": [],}, + "archive": {"url": "some-url", "artifacts": []}, "debian": {"url": "some-url", "date": "something", "packages": [],}, - "deposit": {"url": "some-url", "deposit_id": 1,}, "npm": {"url": "https://www.npmjs.com/package/onepackage",}, "pypi": {"url": "some-url",}, } for loader_type, kwargs in loader_input.items(): + kwargs["storage"] = swh_loader_config["storage"] loader = get_loader(loader_type, **kwargs) assert isinstance(loader, PackageLoader) @@ -50,31 +53,44 @@ """ runner = CliRunner() - result = runner.invoke(run, ["-h"]) + + result = runner.invoke(loader_cli, ["run", "-h"]) assert result.exit_code == 0 usage_prefix = _write_usage( - "run", f"[OPTIONS] [{'|'.join(SUPPORTED_LOADERS)}] URL [OPTIONS]..." + "loader", f"run [OPTIONS] [{'|'.join(SUPPORTED_LOADERS)}]\n" ) - expected_help_msg = f"""{usage_prefix} + assert result.output.startswith(usage_prefix) - Ingest with loader the origin located at -Options: - -h, --help Show this message and exit. 
-""" - assert result.output.startswith(expected_help_msg) +def test_run_with_configuration_failure(tmp_path): + """Triggering a load should fail since configuration is incomplete + + """ + runner = CliRunner() + + conf_path = os.path.join(str(tmp_path), "cli.yml") + with open(conf_path, "w") as f: + f.write(yaml.dump({})) + + with pytest.raises(ValueError, match="Missing storage"): + runner.invoke( + loader_cli, ["-C", conf_path, "run", "pypi", "url=https://some-url",], + catch_exceptions=False + ) def test_run_pypi(mocker, swh_config): """Triggering a load should be ok """ - mock_loader = mocker.patch("swh.loader.package.pypi.loader.PyPILoader") + mock_loader = mocker.patch("swh.loader.package.pypi.loader.PyPILoader.load") runner = CliRunner() - result = runner.invoke(run, ["pypi", "https://some-url"]) + result = runner.invoke( + loader_cli, ["-C", swh_config, "run", "pypi", "url=https://some-url",] + ) assert result.exit_code == 0 - mock_loader.assert_called_once_with(url="https://some-url") # constructor + mock_loader.assert_called_once_with() def test_run_with_visit_date(mocker, swh_config): @@ -86,14 +102,17 @@ runner = CliRunner() input_date = "2016-05-03 15:16:32+00" result = runner.invoke( - run, ["npm", "https://some-url", f"visit_date='{input_date}'"] + loader_cli, ["run", "npm", "https://some-url", f"visit_date='{input_date}'"] ) assert result.exit_code == 0 expected_parsed_date = datetime.datetime( 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc ) mock_loader.assert_called_once_with( - "npm", url="https://some-url", visit_date=expected_parsed_date + "npm", + storage={"cls": "memory"}, + url="https://some-url", + visit_date=expected_parsed_date, ) @@ -102,12 +121,11 @@ """ runner = CliRunner() - result = runner.invoke(list, ["--help"]) + result = runner.invoke(loader_cli, ["list", "--help"]) assert result.exit_code == 0 - usage_prefix = _write_usage( - "list", f"[OPTIONS] [[{'|'.join(['all'] + SUPPORTED_LOADERS)}]]" - ) + usage_prefix = 
_write_usage("loader", "list [OPTIONS]\n") expected_help_msg = f"""{usage_prefix} + [[{'|'.join(['all'] + SUPPORTED_LOADERS)}]] List supported loaders and optionally their arguments @@ -122,10 +140,9 @@ """ runner = CliRunner() - result = runner.invoke(list, ["npm"]) + result = runner.invoke(loader_cli, ["list", "npm"]) assert result.exit_code == 0 expected_help_msg = """ Loader: Load npm origin's artifact releases into swh archive. -signature: (url: str) """ assert result.output.startswith(expected_help_msg[1:]) diff --git a/swh/loader/tests/test_init.py b/swh/loader/tests/test_init.py --- a/swh/loader/tests/test_init.py +++ b/swh/loader/tests/test_init.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -163,6 +163,15 @@ ) +@pytest.fixture +def swh_storage_backend_config(swh_storage_postgresql): + return { + "cls": "local", + "db": swh_storage_postgresql.dsn, + "objstorage": {"cls": "memory"}, + } + + @pytest.fixture def mock_storage(mocker): mock_storage = mocker.patch("swh.loader.tests.origin_get_latest_visit_status")