diff --git a/CONTRIBUTORS b/CONTRIBUTORS new file mode 100644 index 0000000..48d0962 --- /dev/null +++ b/CONTRIBUTORS @@ -0,0 +1 @@ +Antoine Eiche \ No newline at end of file diff --git a/conftest.py b/conftest.py index 50822c3..b669dfa 100644 --- a/conftest.py +++ b/conftest.py @@ -1,75 +1,76 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pytest import yaml from typing import Any, Dict from swh.storage.tests.conftest import * # noqa from swh.scheduler.tests.conftest import * # noqa @pytest.fixture def swh_loader_config(swh_storage_postgresql) -> Dict[str, Any]: return { 'storage': { 'cls': 'pipeline', 'steps': [ {'cls': 'retry'}, {'cls': 'filter'}, {'cls': 'buffer'}, { 'cls': 'local', 'args': { 'db': swh_storage_postgresql.dsn, 'objstorage': { 'cls': 'memory', 'args': {} }, } } ] }, 'deposit': { 'url': 'https://deposit.softwareheritage.org/1/private', 'auth': { 'username': 'user', 'password': 'pass', } }, } @pytest.fixture def swh_config(swh_loader_config, monkeypatch, tmp_path): conffile = os.path.join(str(tmp_path), 'loader.yml') with open(conffile, 'w') as f: f.write(yaml.dump(swh_loader_config)) monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile) return conffile @pytest.fixture(autouse=True, scope='session') def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. 
""" os.environ['http_proxy'] = 'http://localhost:999' os.environ['https_proxy'] = 'http://localhost:999' @pytest.fixture(scope='session') # type: ignore # expected redefinition def celery_includes(): return [ 'swh.loader.package.archive.tasks', 'swh.loader.package.cran.tasks', 'swh.loader.package.debian.tasks', 'swh.loader.package.deposit.tasks', 'swh.loader.package.npm.tasks', 'swh.loader.package.pypi.tasks', + 'swh.loader.package.functional.tasks', ] diff --git a/setup.py b/setup.py index d491c79..ee74525 100755 --- a/setup.py +++ b/setup.py @@ -1,76 +1,77 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = 'requirements-%s.txt' % name else: reqf = 'requirements.txt' requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.loader.core', description='Software Heritage Base Loader', long_description=long_description, long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDBASE', packages=find_packages(), # packages's modules scripts=[], # scripts to package install_requires=parse_requirements() + parse_requirements('swh'), setup_requires=['vcversioner'], extras_require={'testing': parse_requirements('test')}, vcversioner={}, 
include_package_data=True, entry_points=''' [swh.workers] loader.archive=swh.loader.package.archive:register loader.cran=swh.loader.package.cran:register loader.debian=swh.loader.package.debian:register loader.deposit=swh.loader.package.deposit:register + loader.functional=swh.loader.package.functional:register loader.npm=swh.loader.package.npm:register loader.pypi=swh.loader.package.pypi:register [swh.cli.subcommands] loader=swh.loader.cli:loader ''', classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', 'Funding': 'https://www.softwareheritage.org/donate', 'Source': 'https://forge.softwareheritage.org/source/swh-loader-core', }, )
diff --git a/swh/loader/package/functional/__init__.py b/swh/loader/package/functional/__init__.py
new file mode 100644
index 0000000..3bf7da1
--- /dev/null
+++ b/swh/loader/package/functional/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from typing import Any, Mapping
+
+
+def register() -> Mapping[str, Any]:
+    """Register the current worker module's definition"""
+    from .loader import FunctionalLoader
+    return {
+        'task_modules': [f'{__name__}.tasks'],
+        'loader': FunctionalLoader,
+    }
diff --git a/swh/loader/package/functional/loader.py b/swh/loader/package/functional/loader.py
new file mode 100644
index 0000000..1d861fd
--- /dev/null
+++ b/swh/loader/package/functional/loader.py
@@ -0,0 +1,124 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import requests
+
+from typing import Dict, Optional, Any
+
+from swh.model.model import (
+    Sha1Git, Revision, RevisionType
+)
+
+from swh.loader.package.utils import EMPTY_AUTHOR
+
+from swh.loader.package.loader import PackageLoader
+
+
+def retrieve_sources(url: str) -> Dict[str, Any]:
+    """Download the JSON document located at url and decode it.
+
+    Args:
+        url: location of the sources.json file to fetch
+
+    Returns:
+        The decoded JSON document
+
+    Raises:
+        ValueError: if the HTTP status code of the response is not 200
+        json.JSONDecodeError: if the response body is not valid JSON
+
+    """
+    response = requests.get(url,
+                            allow_redirects=True)
+    if response.status_code != 200:
+        # Interpolate explicitly: unlike logging calls, exception
+        # constructors do not %-format their extra arguments.
+        raise ValueError("Got %d HTTP code on %s" %
+                         (response.status_code, url))
+
+    return json.loads(response.content.decode('utf-8'))
+
+
+class FunctionalLoader(PackageLoader):
+    """Load sources from a sources.json file. This loader is used to load
+    sources used by functional package managers (e.g. Nix and Guix).
+
+    """
+    visit_type = 'functional'
+
+    def __init__(self, url):
+        """Fetch the sources.json document located at url and keep its
+        'sources' entry.
+
+        The document is downloaded at instantiation time, so the errors
+        raised by retrieve_sources propagate from the constructor.
+
+        """
+        super().__init__(url=url)
+        self.sources = retrieve_sources(url)['sources']
+        self.provider_url = url
+
+    # Note: this could be renamed get_artifacts in the PackageLoader
+    # base class.
+    def get_versions(self):
+        """Return the first url of each source of the sources.json file."""
+        # TODO: try all mirrors and not only the first one. A source
+        # can be fetched from several urls, called mirrors. We
+        # currently only use the first one, but if the first one
+        # fails, we should try the second one and so on.
+        return [s['url'][0] for s in self.sources]
+
+    # Note: this could be renamed get_artifact_info in the PackageLoader
+    # base class.
+    def get_package_info(self, source):
+        """Yield a single pair made of the source url and a dict carrying
+        that url under its 'url' and 'raw' keys.
+
+        """
+        # TODO: we need to provide the sha256 of the source also
+        yield source, {'url': source, 'raw': {'url': source}}
+
+    def resolve_revision_from(
+            self, known_artifacts: Dict, artifact_metadata: Dict) \
+            -> Optional[bytes]:
+        """Return the identifier of the known artifact whose raw extrinsic
+        url equals the url of artifact_metadata, or None when no known
+        artifact matches.
+
+        """
+        for rev_id, known_artifact in known_artifacts.items():
+            known_url = known_artifact['extrinsic']['raw']['url']
+            if artifact_metadata['url'] == known_url:
+                return rev_id
+        return None
+
+    def build_revision(self, a_metadata: Dict, uncompressed_path: str,
+                       directory: Sha1Git) -> Optional[Revision]:
+        """Build a synthetic revision of type tar targeting directory.
+
+        EMPTY_AUTHOR is used as author and committer and no date is set.
+        The provider url, the visit date and a_metadata are stored in
+        the extrinsic metadata; uncompressed_path is not used.
+
+        """
+        return Revision(
+            type=RevisionType.TAR,
+            message=b'',
+            author=EMPTY_AUTHOR,
+            date=None,
+            committer=EMPTY_AUTHOR,
+            committer_date=None,
+            parents=[],
+            directory=directory,
+            synthetic=True,
+            metadata={
+                'extrinsic': {
+                    'provider': self.provider_url,
+                    'when': self.visit_date.isoformat(),
+                    'raw': a_metadata,
+                },
+            }
+        )
diff --git a/swh/loader/package/functional/tasks.py b/swh/loader/package/functional/tasks.py new file mode 100644 index 0000000..35b47e5 --- /dev/null +++ b/swh/loader/package/functional/tasks.py @@ -0,0 +1,16 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.loader.package.functional.loader import ( + FunctionalLoader +) + + +@shared_task(name=__name__ + '.LoadFunctional') +def load_functional(*, url=None): + """Load functional (e.g.
guix/nix) package""" + return FunctionalLoader(url).load() diff --git a/swh/loader/package/functional/tests/data/https_example.com/file.txt b/swh/loader/package/functional/tests/data/https_example.com/file.txt new file mode 100644 index 0000000..d95f3ad --- /dev/null +++ b/swh/loader/package/functional/tests/data/https_example.com/file.txt @@ -0,0 +1 @@ +content diff --git a/swh/loader/package/functional/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz b/swh/loader/package/functional/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz new file mode 100644 index 0000000..0ead277 Binary files /dev/null and b/swh/loader/package/functional/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz differ diff --git a/swh/loader/package/functional/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz b/swh/loader/package/functional/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz new file mode 100644 index 0000000..8b47ea3 Binary files /dev/null and b/swh/loader/package/functional/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz differ diff --git a/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json b/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json new file mode 100644 index 0000000..3395d01 --- /dev/null +++ b/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json @@ -0,0 +1,13 @@ +{ + "sources": [ + { + "type": "url", + "url": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ] + }, + { + "type": "url", + "url": [ "https://example.com/file.txt" ] + } + ], + "version": 1 +} diff --git a/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 b/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 new file mode 100644 index 0000000..127b7e2 --- /dev/null +++ 
b/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 @@ -0,0 +1,17 @@ +{ + "sources": [ + { + "type": "url", + "url": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ] + }, + { + "type": "url", + "url": [ "https://github.com/owner-2/repository-1/revision-1.tgz" ] + }, + { + "type": "url", + "url": [ "https://example.com/file.txt" ] + } + ], + "version": 1 +}
diff --git a/swh/loader/package/functional/tests/test_functional.py b/swh/loader/package/functional/tests/test_functional.py
new file mode 100644
index 0000000..91a19ec
--- /dev/null
+++ b/swh/loader/package/functional/tests/test_functional.py
@@ -0,0 +1,221 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+from json.decoder import JSONDecodeError
+
+from swh.loader.package.functional.loader import (
+    FunctionalLoader, retrieve_sources
+)
+
+from swh.loader.package.tests.common import (
+    get_stats, check_snapshot
+)
+
+sources_url = 'https://nix-community.github.io/nixpkgs-swh/sources.json'
+
+
+def test_retrieve_sources(swh_config, requests_mock_datadir):
+    """The mocked sources.json document exposes its two sources."""
+    j = retrieve_sources(sources_url)
+    assert "sources" in j
+    assert len(j["sources"]) == 2
+
+
+def test_retrieve_non_existing(swh_config, requests_mock_datadir):
+    """Instantiating the loader on an unknown url raises a ValueError."""
+    with pytest.raises(ValueError):
+        FunctionalLoader('https://non-existing-url')
+
+
+def test_retrieve_non_json(swh_config, requests_mock_datadir):
+    """Instantiating the loader on a non JSON document raises."""
+    with pytest.raises(JSONDecodeError):
+        FunctionalLoader('https://example.com/file.txt')
+
+
+def test_loader_one_visit(swh_config, requests_mock_datadir):
+    """A first visit is eventful but its status is only partial."""
+    loader = FunctionalLoader(sources_url)
+    res = loader.load()
+    assert res['status'] == 'eventful'
+
+    stats = get_stats(loader.storage)
+    assert {
+        'content': 1,
+        'directory': 3,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': 1,
+        'skipped_content': 0,
+        'snapshot': 1
+    } == stats
+
+    origin_visit = next(loader.storage.origin_visit_get(sources_url))
+    # The visit is partial because urls pointing to non tarball files
+    # are not handled yet
+    assert origin_visit['status'] == 'partial'
+    assert origin_visit['type'] == 'functional'
+
+
+def test_uncompress_failure(swh_config, requests_mock_datadir):
+    """Non tarball files are currently not supported and the uncompress
+    function fails on such kind of files.
+
+    However, even in this case of failure (because of the url
+    https://example.com/file.txt), a snapshot and a visit have to be
+    created (with a status partial since all files are not archived).
+
+    """
+    loader = FunctionalLoader(sources_url)
+    loader_status = loader.load()
+
+    urls = [s['url'][0] for s in loader.sources]
+    assert "https://example.com/file.txt" in urls
+    assert loader_status['status'] == 'eventful'
+
+    origin_visit = next(loader.storage.origin_visit_get(sources_url))
+    # The visit is partial because urls pointing to non tarball files
+    # are not handled yet
+    assert origin_visit['status'] == 'partial'
+
+
+def test_loader_incremental(swh_config, requests_mock_datadir):
+    """Ensure a second visit does not download artifacts already
+    downloaded by the previous visit.
+
+    """
+    loader = FunctionalLoader(sources_url)
+    load_status = loader.load()
+
+    loader = FunctionalLoader(sources_url)
+    loader.load()
+    expected_snapshot_id = '2c7f01ef3115f7999a013979fa27bfa12dcb63eb'
+    assert load_status == {
+        'status': 'eventful',
+        'snapshot_id': expected_snapshot_id
+    }
+    expected_branches = {
+        'https://github.com/owner-1/repository-1/revision-1.tgz': {
+            'target': '488ad4e7b8e2511258725063cf43a2b897c503b4',
+            'target_type': 'revision'
+        },
+    }
+    expected_snapshot = {
+        'id': expected_snapshot_id,
+        'branches': expected_branches,
+    }
+    check_snapshot(expected_snapshot, storage=loader.storage)
+
+    urls = [
+        m.url for m in requests_mock_datadir.request_history
+        if m.url == ('https://github.com/owner-1/repository-1/revision-1.tgz')
+    ]
+    # The artifact
+    # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only
+    # visited one time
+    assert len(urls) == 1
+
+
+def test_loader_two_visits(swh_config, requests_mock_datadir_visits):
+    """Ensure there is only one origin, but two visits, two revisions
+    and two snapshots are created.
+
+    The first visit creates a snapshot containing one tarball. The
+    second visit creates a snapshot containing the same tarball and
+    another tarball.
+
+    """
+    loader = FunctionalLoader(sources_url)
+    load_status = loader.load()
+    expected_snapshot_id = '2c7f01ef3115f7999a013979fa27bfa12dcb63eb'
+    assert load_status == {
+        'status': 'eventful',
+        'snapshot_id': expected_snapshot_id
+    }
+
+    expected_branches = {
+        'https://github.com/owner-1/repository-1/revision-1.tgz': {
+            'target': '488ad4e7b8e2511258725063cf43a2b897c503b4',
+            'target_type': 'revision'
+        }
+    }
+
+    expected_snapshot = {
+        'id': expected_snapshot_id,
+        'branches': expected_branches,
+    }
+    check_snapshot(expected_snapshot, storage=loader.storage)
+
+    stats = get_stats(loader.storage)
+    assert {
+        'content': 1,
+        'directory': 3,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': 1,
+        'skipped_content': 0,
+        'snapshot': 1
+    } == stats
+
+    loader = FunctionalLoader(sources_url)
+    load_status = loader.load()
+    expected_snapshot_id = '9c4fbfd991b35c7de876cd66bcda2967a8f476ac'
+    assert load_status == {
+        'status': 'eventful',
+        'snapshot_id': expected_snapshot_id
+    }
+
+    # This ensures visits are incremental. Indeed, if we request a
+    # second time an url, because of the requests_mock_datadir_visits
+    # fixture, the file has to end with `_visit1`.
+    expected_branches = {
+        'https://github.com/owner-1/repository-1/revision-1.tgz': {
+            'target': '488ad4e7b8e2511258725063cf43a2b897c503b4',
+            'target_type': 'revision'
+        },
+        'https://github.com/owner-2/repository-1/revision-1.tgz': {
+            'target': '85e0bad74e33e390aaeb74f139853ae3863ee544',
+            'target_type': 'revision'
+        }
+    }
+
+    expected_snapshot = {
+        'id': expected_snapshot_id,
+        'branches': expected_branches,
+    }
+    check_snapshot(expected_snapshot, storage=loader.storage)
+
+    stats = get_stats(loader.storage)
+    assert {
+        'content': 2,
+        'directory': 5,
+        'origin': 1,
+        'origin_visit': 2,
+        'person': 1,
+        'release': 0,
+        'revision': 2,
+        'skipped_content': 0,
+        'snapshot': 2
+    } == stats
+
+
+def test_resolve_revision_from(swh_config, requests_mock_datadir):
+    """Known artifacts are matched on their url; unknown urls yield None."""
+    loader = FunctionalLoader(sources_url)
+
+    known_artifacts = {
+        'id1': {'extrinsic': {'raw': {'url': "url1"}}},
+        'id2': {'extrinsic': {'raw': {'url': "url2"}}}
+    }
+
+    metadata = {'url': 'url1'}
+    assert loader.resolve_revision_from(known_artifacts, metadata) == 'id1'
+    metadata = {'url': 'url3'}
+    assert loader.resolve_revision_from(known_artifacts, metadata) is None
diff --git a/swh/loader/package/functional/tests/test_tasks.py b/swh/loader/package/functional/tests/test_tasks.py new file mode 100644 index 0000000..d204bd0 --- /dev/null +++ b/swh/loader/package/functional/tests/test_tasks.py @@ -0,0 +1,26 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_functional_loader(mocker, swh_app, celery_session_worker, swh_config): + mock_loader = mocker.patch( + 'swh.loader.package.functional.loader.FunctionalLoader.load') + mock_loader.return_value = {'status': 'eventful'} + + mock_retrieve_sources = mocker.patch( + 'swh.loader.package.functional.loader.retrieve_sources') +
mock_retrieve_sources.return_value = { + 'sources': [], + 'revision': 'some-revision' + } + + res = swh_app.send_task( + 'swh.loader.package.functional.tasks.LoadFunctional', + kwargs=dict(url='some-url')) + assert res + res.wait() + assert res.successful() + + assert res.result == {'status': 'eventful'} diff --git a/swh/loader/tests/test_cli.py b/swh/loader/tests/test_cli.py index 3fb4bd5..f50b60c 100644 --- a/swh/loader/tests/test_cli.py +++ b/swh/loader/tests/test_cli.py @@ -1,112 +1,113 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.loader.cli import run, list, get_loader, SUPPORTED_LOADERS from swh.loader.package.loader import PackageLoader from click.testing import CliRunner def test_get_loader_wrong_input(swh_config): """Unsupported loader should raise """ loader_type = 'unknown' assert loader_type not in SUPPORTED_LOADERS with pytest.raises(ValueError, match='Invalid loader'): get_loader(loader_type, url='db-url') def test_get_loader(swh_config): """Instantiating a supported loader should be ok """ loader_input = { 'archive': { 'url': 'some-url', 'artifacts': [], }, 'debian': { 'url': 'some-url', 'date': 'something', 'packages': [], }, 'deposit': { 'url': 'some-url', 'deposit_id': 1, }, 'npm': { 'url': 'https://www.npmjs.com/package/onepackage', }, 'pypi': { 'url': 'some-url', }, } for loader_type, kwargs in loader_input.items(): loader = get_loader(loader_type, **kwargs) assert isinstance(loader, PackageLoader) def test_run_help(swh_config): """Help message should be ok """ runner = CliRunner() result = runner.invoke(run, ['-h']) assert result.exit_code == 0 - expected_help_msg = """Usage: run [OPTIONS] [archive|cran|debian|deposit|npm|pypi] URL [OPTIONS]... 
+ expected_help_msg = """Usage: run [OPTIONS] [archive|cran|debian|deposit|functional|npm|pypi] URL + [OPTIONS]... Ingest with loader the origin located at Options: -h, --help Show this message and exit. """ # noqa assert result.output.startswith(expected_help_msg) def test_run_pypi(mocker, swh_config): """Triggering a load should be ok """ mock_loader = mocker.patch('swh.loader.package.pypi.loader.PyPILoader') runner = CliRunner() result = runner.invoke(run, ['pypi', 'https://some-url']) assert result.exit_code == 0 mock_loader.assert_called_once_with(url='https://some-url') # constructor def test_list_help(mocker, swh_config): """Triggering a load should be ok """ runner = CliRunner() result = runner.invoke(list, ['--help']) assert result.exit_code == 0 - expected_help_msg = """Usage: list [OPTIONS] [[all|archive|cran|debian|deposit|npm|pypi]] + expected_help_msg = """Usage: list [OPTIONS] [[all|archive|cran|debian|deposit|functional|npm|pypi]] List supported loaders and optionally their arguments Options: -h, --help Show this message and exit. """ # noqa assert result.output.startswith(expected_help_msg) def test_list_help_npm(mocker, swh_config): """Triggering a load should be ok """ runner = CliRunner() result = runner.invoke(list, ['npm']) assert result.exit_code == 0 expected_help_msg = '''Loader: Load npm origin's artifact releases into swh archive. signature: (url: str) ''' # noqa assert result.output.startswith(expected_help_msg)