diff --git a/conftest.py b/conftest.py index 0934465..21d6ebd 100644 --- a/conftest.py +++ b/conftest.py @@ -1,66 +1,66 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pytest import yaml from typing import Any, Dict from swh.storage.tests.conftest import * # noqa from swh.scheduler.tests.conftest import * # noqa @pytest.fixture def swh_loader_config(swh_storage_postgresql) -> Dict[str, Any]: return { 'storage': { 'cls': 'local', 'args': { 'db': swh_storage_postgresql.dsn, 'objstorage': { 'cls': 'memory', 'args': {} }, }, }, 'deposit': { 'url': 'https://deposit.softwareheritage.org/1/private', 'auth': { 'username': 'user', 'password': 'pass', } }, } @pytest.fixture def swh_config(swh_loader_config, monkeypatch, tmp_path): conffile = os.path.join(str(tmp_path), 'loader.yml') with open(conffile, 'w') as f: f.write(yaml.dump(swh_loader_config)) monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile) return conffile @pytest.fixture(autouse=True, scope='session') def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. """ os.environ['http_proxy'] = 'http://localhost:999' os.environ['https_proxy'] = 'http://localhost:999' @pytest.fixture(scope='session') # type: ignore # expected redefinition def celery_includes(): return [ - 'swh.loader.package.tasks', 'swh.loader.package.archive.tasks', 'swh.loader.package.debian.tasks', 'swh.loader.package.deposit.tasks', 'swh.loader.package.npm.tasks', + 'swh.loader.package.pypi.tasks', ] diff --git a/setup.py b/setup.py index bf48200..e88282c 100755 --- a/setup.py +++ b/setup.py @@ -1,72 +1,73 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = 'requirements-%s.txt' % name else: reqf = 'requirements.txt' requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.loader.core', description='Software Heritage Base Loader', long_description=long_description, long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDBASE', packages=find_packages(), # packages's modules scripts=[], # scripts to package install_requires=parse_requirements() + parse_requirements('swh'), setup_requires=['vcversioner'], extras_require={'testing': parse_requirements('test')}, vcversioner={}, include_package_data=True, entry_points=''' [swh.workers] loader.archive=swh.loader.package.archive:register loader.debian=swh.loader.package.debian:register loader.deposit=swh.loader.package.deposit:register loader.npm=swh.loader.package.npm:register + loader.pypi=swh.loader.package.pypi:register ''', classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', 'Funding': 'https://www.softwareheritage.org/donate', 'Source': 'https://forge.softwareheritage.org/source/swh-loader-core', }, ) diff --git a/swh/loader/package/tasks.py b/swh/loader/package/pypi/__init__.py similarity index 54% copy from swh/loader/package/tasks.py copy to swh/loader/package/pypi/__init__.py index 15e63ce..d39d5f5 100644 --- a/swh/loader/package/tasks.py +++ b/swh/loader/package/pypi/__init__.py @@ -1,14 +1,14 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from celery import shared_task -from swh.loader.package.pypi import PyPILoader +from typing import Any, Mapping -@shared_task(name=__name__ + '.LoadPyPI') -def load_pypi(*, url=None): - """Load PyPI package""" - return PyPILoader(url).load() +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + return { + 'task_modules': [f'{__name__}.tasks'], + } diff --git a/swh/loader/package/pypi.py b/swh/loader/package/pypi/loader.py similarity index 100% rename from swh/loader/package/pypi.py rename to swh/loader/package/pypi/loader.py diff --git a/swh/loader/package/tasks.py b/swh/loader/package/pypi/tasks.py similarity index 88% rename from swh/loader/package/tasks.py rename to swh/loader/package/pypi/tasks.py index 15e63ce..748ace4 100644 --- a/swh/loader/package/tasks.py +++ b/swh/loader/package/pypi/tasks.py @@ -1,14 +1,14 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.pypi import PyPILoader +from swh.loader.package.pypi.loader import PyPILoader @shared_task(name=__name__ + '.LoadPyPI') def load_pypi(*, url=None): """Load PyPI package""" return PyPILoader(url).load() diff --git a/swh/loader/package/pypi/tests/__init__.py b/swh/loader/package/pypi/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 b/swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 similarity index 100% rename from swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 rename to swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 diff --git a/swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json b/swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json similarity index 100% rename from swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json rename to swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json diff --git a/swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 b/swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 similarity index 100% rename from swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 rename to swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 diff --git a/swh/loader/package/tests/data/https_pypi.org/pypi_nexter_json b/swh/loader/package/pypi/tests/data/https_pypi.org/pypi_nexter_json similarity index 100% rename from swh/loader/package/tests/data/https_pypi.org/pypi_nexter_json rename to swh/loader/package/pypi/tests/data/https_pypi.org/pypi_nexter_json diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py similarity index 99% rename from swh/loader/package/tests/test_pypi.py rename to swh/loader/package/pypi/tests/test_pypi.py index d16ae66..a4ddcac 100644 --- a/swh/loader/package/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -1,803 +1,803 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from os import path import pytest from unittest.mock import patch from swh.core.tarball import uncompress from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.model.hashutil import hash_to_bytes -from swh.loader.package.pypi import ( +from swh.loader.package.pypi.loader import ( PyPILoader, pypi_api_url, author, extract_intrinsic_metadata, artifact_to_revision_id ) from swh.loader.package.tests.common import ( check_snapshot, check_metadata_paths, get_stats ) def test_author_basic(): data = { 'author': "i-am-groot", 'author_email': 'iam@groot.org', } actual_author = author(data) expected_author = { 'fullname': b'i-am-groot ', 'name': b'i-am-groot', 'email': b'iam@groot.org', } assert actual_author == expected_author def test_author_empty_email(): data = { 'author': 'i-am-groot', 'author_email': '', } actual_author = author(data) expected_author = { 'fullname': b'i-am-groot', 'name': b'i-am-groot', 'email': b'', } assert actual_author == expected_author def test_author_empty_name(): data = { 'author': "", 'author_email': 'iam@groot.org', } actual_author = author(data) expected_author = { 'fullname': b' ', 'name': b'', 'email': b'iam@groot.org', } assert actual_author == expected_author def test_author_malformed(): data = { 'author': "['pierre', 'paul', 'jacques']", 'author_email': None, } actual_author = author(data) expected_author = { 'fullname': b"['pierre', 'paul', 'jacques']", 'name': b"['pierre', 'paul', 'jacques']", 'email': None, } assert actual_author == expected_author def test_author_malformed_2(): data = { 'author': '[marie, jeanne]', 'author_email': '[marie@some, jeanne@thing]', } actual_author = author(data) expected_author = { 'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>', 'name': b'[marie, jeanne]', 'email': b'[marie@some, jeanne@thing]', } assert actual_author == expected_author def test_author_malformed_3(): data = { 'author': '[marie, jeanne, pierre]', 'author_email': '[marie@somewhere.org, jeanne@somewhere.org]', } actual_author = author(data) expected_author = { 'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa 'name': b'[marie, jeanne, pierre]', 'email': b'[marie@somewhere.org, jeanne@somewhere.org]', } actual_author == expected_author # configuration error # def test_badly_configured_loader_raise(monkeypatch): """Badly configured loader should raise""" monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False) with pytest.raises(ValueError) as e: PyPILoader(url='some-url') assert 'Misconfiguration' in e.value.args[0] def test_pypi_api_url(): """Compute pypi api url from the pypi project url should be ok""" url = pypi_api_url('https://pypi.org/project/requests') assert url == 'https://pypi.org/pypi/requests/json' def test_pypi_api_url_with_slash(): """Compute pypi api url from the pypi project url should be ok""" url = pypi_api_url('https://pypi.org/project/requests/') assert url == 'https://pypi.org/pypi/requests/json' @pytest.mark.fs def test_extract_intrinsic_metadata(tmp_path, datadir): """Parsing existing archive's PKG-INFO should yield results""" uncompressed_archive_path = str(tmp_path) archive_path = path.join( datadir, 'https_files.pythonhosted.org', '0805nexter-1.1.0.zip') uncompress(archive_path, dest=uncompressed_archive_path) actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path) expected_metadata = { 'metadata_version': '1.0', 'name': '0805nexter', 'version': '1.1.0', 'summary': 'a simple printer of nested lest', 'home_page': 'http://www.hp.com', 'author': 'hgtkpython', 'author_email': '2868989685@qq.com', 'platforms': ['UNKNOWN'], } assert actual_metadata == expected_metadata @pytest.mark.fs def test_extract_intrinsic_metadata_failures(tmp_path): """Parsing inexistent path/archive/PKG-INFO yield None""" tmp_path = str(tmp_path) # py3.5 work around (PosixPath issue) # inexistent first level path assert extract_intrinsic_metadata('/something-inexistent') == {} # inexistent second level path (as expected by pypi archives) assert extract_intrinsic_metadata(tmp_path) == {} # inexistent PKG-INFO within second level path existing_path_no_pkginfo = path.join(tmp_path, 'something') os.mkdir(existing_path_no_pkginfo) assert extract_intrinsic_metadata(tmp_path) == {} # LOADER SCENARIO # # "edge" cases (for the same origin) # # no release artifact: # {visit full, status: uneventful, no contents, etc...} requests_mock_datadir_missing_all = requests_mock_datadir_factory(ignore_urls=[ 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa 'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip', # noqa ]) def test_no_release_artifact(swh_config, requests_mock_datadir_missing_all): """Load a pypi project with all artifacts missing ends up with no snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status['status'] == 'uneventful' assert actual_load_status['snapshot_id'] is not None stats = get_stats(loader.storage) assert { 'content': 0, 'directory': 0, 'origin': 1, 'origin_visit': 1, 'person': 0, 'release': 0, 'revision': 0, 'skipped_content': 0, 'snapshot': 1, } == stats origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'pypi' # problem during loading: # {visit: partial, status: uneventful, no snapshot} def test_release_with_traceback(swh_config): url = 'https://pypi.org/project/0805nexter' - with patch('swh.loader.package.pypi.PyPILoader.get_default_version', + with patch('swh.loader.package.pypi.loader.PyPILoader.get_default_version', side_effect=ValueError('Problem')): loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status == {'status': 'failed'} stats = get_stats(loader.storage) assert { 'content': 0, 'directory': 0, 'origin': 1, 'origin_visit': 1, 'person': 0, 'release': 0, 'revision': 0, 'skipped_content': 0, 'snapshot': 0, } == stats origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'pypi' # problem during loading: failure early enough in between swh contents... # some contents (contents, directories, etc...) have been written in storage # {visit: partial, status: eventful, no snapshot} # problem during loading: failure late enough we can have snapshots (some # revisions are written in storage already) # {visit: partial, status: eventful, snapshot} # "normal" cases (for the same origin) # requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[ 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa ]) # some missing release artifacts: # {visit partial, status: eventful, 1 snapshot} def test_revision_metadata_structure(swh_config, requests_mock_datadir): url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' assert actual_load_status['snapshot_id'] is not None expected_revision_id = hash_to_bytes( 'e445da4da22b31bfebb6ffc4383dbf839a074d21') revision = list(loader.storage.revision_get([expected_revision_id]))[0] assert revision is not None check_metadata_paths(revision['metadata'], paths=[ ('intrinsic.tool', str), ('intrinsic.raw', dict), ('extrinsic.provider', str), ('extrinsic.when', str), ('extrinsic.raw', dict), ('original_artifact', list), ]) for original_artifact in revision['metadata']['original_artifact']: check_metadata_paths(original_artifact, paths=[ ('filename', str), ('length', int), ('checksums', dict), ]) def test_visit_with_missing_artifact( swh_config, requests_mock_datadir_missing_one): """Load a pypi project with some missing artifacts ends up with 1 snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() expected_snapshot_id = 'dd0e4201a232b1c104433741dbf45895b8ac9355' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } stats = get_stats(loader.storage) assert { 'content': 3, 'directory': 2, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 1, 'skipped_content': 0, 'snapshot': 1 } == stats expected_contents = map(hash_to_bytes, [ '405859113963cb7a797642b45f171d6360425d16', 'e5686aa568fdb1d19d7f1329267082fe40482d31', '83ecf6ec1114fd260ca7a833a2d165e71258c338', ]) assert list(loader.storage.content_missing_per_sha1(expected_contents))\ == [] expected_dirs = map(hash_to_bytes, [ 'b178b66bd22383d5f16f4f5c923d39ca798861b4', 'c3a58f8b57433a4b56caaa5033ae2e0931405338', ]) assert list(loader.storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa } assert list(loader.storage.revision_missing(expected_revs)) == [] expected_branches = { 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.2.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'pypi' def test_visit_with_1_release_artifact(swh_config, requests_mock_datadir): """With no prior visit, load a pypi project ends up with 1 snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() expected_snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } stats = get_stats(loader.storage) assert { 'content': 6, 'directory': 4, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1 } == stats expected_contents = map(hash_to_bytes, [ 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', '938c33483285fd8ad57f15497f538320df82aeb8', 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', '405859113963cb7a797642b45f171d6360425d16', 'e5686aa568fdb1d19d7f1329267082fe40482d31', '83ecf6ec1114fd260ca7a833a2d165e71258c338', ]) assert list(loader.storage.content_missing_per_sha1(expected_contents))\ == [] expected_dirs = map(hash_to_bytes, [ '05219ba38bc542d4345d5638af1ed56c7d43ca7d', 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', 'b178b66bd22383d5f16f4f5c923d39ca798861b4', 'c3a58f8b57433a4b56caaa5033ae2e0931405338', ]) assert list(loader.storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa } assert list(loader.storage.revision_missing(expected_revs)) == [] expected_branches = { 'releases/1.1.0': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.2.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'pypi' def test_multiple_visits_with_no_change(swh_config, requests_mock_datadir): """Multiple visits with no changes results in 1 same snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': snapshot_id, } stats = get_stats(loader.storage) assert { 'content': 6, 'directory': 4, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1 } == stats expected_branches = { 'releases/1.1.0': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.2.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'pypi' actual_load_status2 = loader.load() assert actual_load_status2 == { 'status': 'uneventful', 'snapshot_id': actual_load_status2['snapshot_id'] } stats2 = get_stats(loader.storage) expected_stats2 = stats.copy() expected_stats2['origin_visit'] = 1 + 1 assert expected_stats2 == stats2 # same snapshot actual_snapshot_id = origin_visit['snapshot'] assert actual_snapshot_id == hash_to_bytes(snapshot_id) def test_incremental_visit(swh_config, requests_mock_datadir_visits): """With prior visit, 2nd load will result with a different snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) visit1_actual_load_status = loader.load() visit1_stats = get_stats(loader.storage) expected_snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a' assert visit1_actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } origin_visit1 = next(loader.storage.origin_visit_get(url)) assert origin_visit1['status'] == 'full' assert origin_visit1['type'] == 'pypi' assert { 'content': 6, 'directory': 4, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1 } == visit1_stats # Reset internal state loader._info = None visit2_actual_load_status = loader.load() visit2_stats = get_stats(loader.storage) assert visit2_actual_load_status['status'] == 'eventful' expected_snapshot_id2 = '2e5149a7b0725d18231a37b342e9b7c4e121f283' assert visit2_actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id2 } visits = list(loader.storage.origin_visit_get(url)) assert len(visits) == 2 assert visits[1]['status'] == 'full' assert visits[1]['type'] == 'pypi' assert { 'content': 6 + 1, # 1 more content 'directory': 4 + 2, # 2 more directories 'origin': 1, 'origin_visit': 1 + 1, 'person': 1, 'release': 0, 'revision': 2 + 1, # 1 more revision 'skipped_content': 0, 'snapshot': 1 + 1, # 1 more snapshot } == visit2_stats expected_contents = map(hash_to_bytes, [ 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', '938c33483285fd8ad57f15497f538320df82aeb8', 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', '405859113963cb7a797642b45f171d6360425d16', 'e5686aa568fdb1d19d7f1329267082fe40482d31', '83ecf6ec1114fd260ca7a833a2d165e71258c338', '92689fa2b7fb4d4fc6fb195bf73a50c87c030639' ]) assert list(loader.storage.content_missing_per_sha1(expected_contents))\ == [] expected_dirs = map(hash_to_bytes, [ '05219ba38bc542d4345d5638af1ed56c7d43ca7d', 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', 'b178b66bd22383d5f16f4f5c923d39ca798861b4', 'c3a58f8b57433a4b56caaa5033ae2e0931405338', 'e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a', '52604d46843b898f5a43208045d09fcf8731631b', ]) assert list(loader.storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa hash_to_bytes('51247143b01445c9348afa9edfae31bf7c5d86b1'): hash_to_bytes('e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a'), # noqa } assert list(loader.storage.revision_missing(expected_revs)) == [] expected_branches = { 'releases/1.1.0': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'releases/1.3.0': { 'target': '51247143b01445c9348afa9edfae31bf7c5d86b1', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.3.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': expected_snapshot_id2, 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) origin_visit = list(loader.storage.origin_visit_get(url))[-1] assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'pypi' urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith('https://files.pythonhosted.org') ] # visited each artifact once across 2 visits assert len(urls) == len(set(urls)) # release artifact, no new artifact # {visit full, status uneventful, same snapshot as before} # release artifact, old artifact with different checksums # {visit full, status full, new snapshot with shared history and some new # different history} # release with multiple sdist artifacts per pypi "version" # snapshot branch output is different def test_visit_1_release_with_2_artifacts(swh_config, requests_mock_datadir): """With no prior visit, load a pypi project ends up with 1 snapshot """ url = 'https://pypi.org/project/nexter' loader = PyPILoader(url) actual_load_status = loader.load() expected_snapshot_id = 'a27e638a4dad6fbfa273c6ebec1c4bf320fb84c6' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id, } expected_branches = { 'releases/1.1.0/nexter-1.1.0.zip': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.1.0/nexter-1.1.0.tar.gz': { 'target': '0bf88f5760cca7665d0af4d6575d9301134fe11a', 'target_type': 'revision', }, } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'pypi' def test_pypi_artifact_to_revision_id_none(): """Current loader version should stop soon if nothing can be found """ artifact_metadata = { 'digests': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa }, } assert artifact_to_revision_id({}, artifact_metadata) is None known_artifacts = { 'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92': { 'original_artifact': { 'sha256': 'something-irrelevant', }, }, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None def test_pypi_artifact_to_revision_id_old_loader_version(): """Current loader version should solve old metadata scheme """ artifact_metadata = { 'digests': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa } } known_artifacts = { hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): { 'original_artifact': { 'sha256': "something-wrong", }, }, hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): { 'original_artifact': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa }, } } assert artifact_to_revision_id(known_artifacts, artifact_metadata) \ == hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116') def test_pypi_artifact_to_revision_id_current_loader_version(): """Current loader version should be able to solve current metadata scheme """ artifact_metadata = { 'digests': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa } } known_artifacts = { hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): { 'original_artifact': [{ 'checksums': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa }, }], }, hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): { 'original_artifact': [{ 'checksums': { 'sha256': 'something-wrong' }, }], }, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) \ == hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92') def test_pypi_artifact_to_revision_id_failures(): with pytest.raises(KeyError, match='sha256'): artifact_metadata = { 'digests': {}, } assert artifact_to_revision_id({}, artifact_metadata) with pytest.raises(KeyError, match='digests'): artifact_metadata = { 'something': 'wrong', } assert artifact_to_revision_id({}, artifact_metadata) diff --git a/swh/loader/package/tests/test_tasks.py b/swh/loader/package/pypi/tests/test_tasks.py similarity index 84% rename from swh/loader/package/tests/test_tasks.py rename to swh/loader/package/pypi/tests/test_tasks.py index 83ac896..d3184d2 100644 --- a/swh/loader/package/tests/test_tasks.py +++ b/swh/loader/package/pypi/tests/test_tasks.py @@ -1,21 +1,21 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch -@patch('swh.loader.package.pypi.PyPILoader.load') +@patch('swh.loader.package.pypi.loader.PyPILoader.load') def test_pypi_loader( mock_loader, swh_app, celery_session_worker, swh_config): mock_loader.return_value = {'status': 'eventful'} res = swh_app.send_task( - 'swh.loader.package.tasks.LoadPyPI', + 'swh.loader.package.pypi.tasks.LoadPyPI', (), dict(url='some-url')) assert res res.wait() assert res.successful() assert res.result == {'status': 'eventful'}