diff --git a/conftest.py b/conftest.py index 21d6ebd..3375fe2 100644 --- a/conftest.py +++ b/conftest.py @@ -1,66 +1,67 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pytest import yaml from typing import Any, Dict from swh.storage.tests.conftest import * # noqa from swh.scheduler.tests.conftest import * # noqa @pytest.fixture def swh_loader_config(swh_storage_postgresql) -> Dict[str, Any]: return { 'storage': { 'cls': 'local', 'args': { 'db': swh_storage_postgresql.dsn, 'objstorage': { 'cls': 'memory', 'args': {} }, }, }, 'deposit': { 'url': 'https://deposit.softwareheritage.org/1/private', 'auth': { 'username': 'user', 'password': 'pass', } }, } @pytest.fixture def swh_config(swh_loader_config, monkeypatch, tmp_path): conffile = os.path.join(str(tmp_path), 'loader.yml') with open(conffile, 'w') as f: f.write(yaml.dump(swh_loader_config)) monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile) return conffile @pytest.fixture(autouse=True, scope='session') def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. """ os.environ['http_proxy'] = 'http://localhost:999' os.environ['https_proxy'] = 'http://localhost:999' @pytest.fixture(scope='session') # type: ignore # expected redefinition def celery_includes(): return [ 'swh.loader.package.archive.tasks', + 'swh.loader.package.cran.tasks', 'swh.loader.package.debian.tasks', 'swh.loader.package.deposit.tasks', 'swh.loader.package.npm.tasks', 'swh.loader.package.pypi.tasks', ] diff --git a/requirements.txt b/requirements.txt index 35eb9f7..ade368b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,11 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. 
For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner retrying psutil requests iso8601 pkginfo python-debian +python-dateutil diff --git a/setup.py b/setup.py index 2ad98e6..d491c79 100755 --- a/setup.py +++ b/setup.py @@ -1,75 +1,76 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = 'requirements-%s.txt' % name else: reqf = 'requirements.txt' requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.loader.core', description='Software Heritage Base Loader', long_description=long_description, long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDBASE', packages=find_packages(), # packages's modules scripts=[], # scripts to package install_requires=parse_requirements() + parse_requirements('swh'), setup_requires=['vcversioner'], extras_require={'testing': parse_requirements('test')}, vcversioner={}, include_package_data=True, entry_points=''' [swh.workers] loader.archive=swh.loader.package.archive:register + loader.cran=swh.loader.package.cran:register loader.debian=swh.loader.package.debian:register loader.deposit=swh.loader.package.deposit:register 
loader.npm=swh.loader.package.npm:register loader.pypi=swh.loader.package.pypi:register [swh.cli.subcommands] loader=swh.loader.cli:loader ''', classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', 'Funding': 'https://www.softwareheritage.org/donate', 'Source': 'https://forge.softwareheritage.org/source/swh-loader-core', }, ) diff --git a/swh/loader/package/cran/__init__.py b/swh/loader/package/cran/__init__.py new file mode 100644 index 0000000..cb8bd97 --- /dev/null +++ b/swh/loader/package/cran/__init__.py @@ -0,0 +1,16 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import CRANLoader + return { + 'task_modules': [f'{__name__}.tasks'], + 'loader': CRANLoader, + } diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py new file mode 100644 index 0000000..1ed24f8 --- /dev/null +++ b/swh/loader/package/cran/loader.py @@ -0,0 +1,160 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import dateutil.parser +import datetime +import os +import logging +import re + +from datetime import timezone +from os import path +from typing import Any, Generator, Dict, List, Mapping, Optional, Tuple + +from debian.deb822 import Deb822 + +from 
swh.loader.package.loader import PackageLoader +from swh.loader.package.utils import release_name, parse_author, swh_author +from swh.model.identifiers import normalize_timestamp + + +logger = logging.getLogger(__name__) + + +DATE_PATTERN = re.compile(r'^(?P<year>\d{4})-(?P<month>\d{2})$')  # YYYY-MM + + +class CRANLoader(PackageLoader): + visit_type = 'cran' + + def __init__(self, url: str, version: str): + """Loader constructor. + + Args: + url: Origin url to retrieve cran artifact from + version: version of the cran artifact + + """ + super().__init__(url=url) + self.version = version + self.provider_url = url + + def get_versions(self) -> List[str]: + # only 1 artifact + return [self.version] + + def get_default_version(self) -> str: + return self.version + + def get_package_info(self, version: str) -> Generator[ + Tuple[str, Dict[str, Any]], None, None]: + p_info = { + 'url': self.url, + 'filename': path.split(self.url)[-1], + 'raw': {} + } + yield release_name(version), p_info + + def build_revision( + self, a_metadata: Mapping[str, Any], + uncompressed_path: str) -> Dict[str, Any]: + # a_metadata is empty + metadata = extract_intrinsic_metadata(uncompressed_path) + normalized_date = normalize_timestamp(parse_date(metadata.get('Date'))) + author = swh_author(parse_author(metadata.get('Maintainer', ''))) + version = metadata.get('Version', self.version) + return { + 'message': version.encode('utf-8'), + 'type': 'tar', + 'date': normalized_date, + 'author': author, + 'committer': author, + 'committer_date': normalized_date, + 'parents': [], + 'metadata': { + 'intrinsic': { + 'tool': 'DESCRIPTION', + 'raw': metadata, + }, + 'extrinsic': { + 'provider': self.provider_url, + 'when': self.visit_date.isoformat(), + 'raw': a_metadata, + }, + }, + } + + +def parse_debian_control(filepath: str) -> Dict[str, Any]: + """Parse debian control at filepath""" + metadata: Dict = {} + logger.debug('Debian control file %s', filepath) + for paragraph in Deb822.iter_paragraphs(open(filepath)): +
logger.debug('paragraph: %s', paragraph) + metadata.update(**paragraph) + + logger.debug('metadata parsed: %s', metadata) + return metadata + + +def extract_intrinsic_metadata(dir_path: str) -> Dict[str, Any]: + """Given an uncompressed path holding the DESCRIPTION file, returns a + DESCRIPTION parsed structure as a dict. + + CRAN origins describe their intrinsic metadata within a DESCRIPTION file + at the root tree of a tarball. This DESCRIPTION uses a simple file format + called DCF, the Debian control format. + + The release artifact contains one folder at its root. For example: + $ tar tvf zprint-0.0.6.tar.gz + drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ + ... + + Args: + dir_path (str): Path to the uncompressed directory + representing a release artifact from CRAN. + + Returns: + the DESCRIPTION parsed structure as a dict (or empty dict if missing) + + """ + # Retrieve the root folder of the archive + if not os.path.exists(dir_path): + return {} + lst = os.listdir(dir_path) + if len(lst) != 1: + return {} + project_dirname = lst[0] + description_path = os.path.join(dir_path, project_dirname, 'DESCRIPTION') + if not os.path.exists(description_path): + return {} + return parse_debian_control(description_path) + + +def parse_date(date: Optional[str]) -> Optional[datetime.datetime]: + """Parse a date into a datetime + + """ + assert not date or isinstance(date, str) + dt: Optional[datetime.datetime] = None + if not date: + return dt + try: + specific_date = DATE_PATTERN.match(date) + if specific_date: + year = int(specific_date.group('year')) + month = int(specific_date.group('month')) + dt = datetime.datetime(year, month, 1) + else: + dt = dateutil.parser.parse(date) + + if not dt.tzinfo: + # up for discussion the timezone needs to be set or + # normalize_timestamp is not happy: ValueError: normalize_timestamp + # received datetime without timezone: 2001-06-08 00:00:00 + dt = dt.replace(tzinfo=timezone.utc) + except Exception as e: +
logger.warning('Fail to parse date %s. Reason: %s', date, e) + return dt diff --git a/swh/loader/package/cran/tasks.py b/swh/loader/package/cran/tasks.py new file mode 100644 index 0000000..64ba6e5 --- /dev/null +++ b/swh/loader/package/cran/tasks.py @@ -0,0 +1,14 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.loader.package.cran.loader import CRANLoader + + +@shared_task(name=__name__ + '.LoadCran') +def load_cran(url=None, version=None): + """Load a CRAN package's artifact""" + return CRANLoader(url, version).load() diff --git a/swh/loader/package/cran/tests/__init__.py b/swh/loader/package/cran/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz b/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz new file mode 100644 index 0000000..52d1037 Binary files /dev/null and b/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz differ diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py new file mode 100644 index 0000000..fabd1c4 --- /dev/null +++ b/swh/loader/package/cran/tests/test_cran.py @@ -0,0 +1,198 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import pytest + +from datetime import datetime, timezone +from dateutil.tz import tzlocal + +from os import path + +from
swh.loader.package.cran.loader import ( + extract_intrinsic_metadata, CRANLoader, parse_date +) +from swh.core.tarball import uncompress + +from swh.loader.package.tests.common import ( + check_snapshot, get_stats +) + + +def test_cran_parse_date(): + data = [ + # parsable, some have debatable results though + ('2001-June-08', + datetime(2001, 6, 8, 0, 0, tzinfo=timezone.utc)), + ('Tue Dec 27 15:06:08 PST 2011', + datetime(2011, 12, 27, 15, 6, 8, tzinfo=timezone.utc)), + ('8-14-2013', + datetime(2013, 8, 14, 0, 0, tzinfo=timezone.utc)), + ('2011-01', + datetime(2011, 1, 1, 0, 0, tzinfo=timezone.utc)), + ('201109', + datetime(2009, 11, 20, 0, 0, tzinfo=timezone.utc)), + ('04-12-2014', + datetime(2014, 4, 12, 0, 0, tzinfo=timezone.utc)), + ('2018-08-24, 10:40:10', + datetime(2018, 8, 24, 10, 40, 10, tzinfo=timezone.utc)), + ('2013-October-16', + datetime(2013, 10, 16, 0, 0, tzinfo=timezone.utc)), + ('Aug 23, 2013', + datetime(2013, 8, 23, 0, 0, tzinfo=timezone.utc)), + ('27-11-2014', + datetime(2014, 11, 27, 0, 0, tzinfo=timezone.utc)), + ('2019-09-26,', + datetime(2019, 9, 26, 0, 0, tzinfo=timezone.utc)), + ('9/25/2014', + datetime(2014, 9, 25, 0, 0, tzinfo=timezone.utc)), + ('Fri Jun 27 17:23:53 2014', + datetime(2014, 6, 27, 17, 23, 53, tzinfo=timezone.utc)), + ('28-04-2014', + datetime(2014, 4, 28, 0, 0, tzinfo=timezone.utc)), + ('04-14-2014', + datetime(2014, 4, 14, 0, 0, tzinfo=timezone.utc)), + ('2019-05-08 14:17:31 UTC', + datetime(2019, 5, 8, 14, 17, 31, tzinfo=timezone.utc)), + ('Wed May 21 13:50:39 CEST 2014', + datetime(2014, 5, 21, 13, 50, 39, tzinfo=tzlocal())), + ('2018-04-10 00:01:04 KST', + datetime(2018, 4, 10, 0, 1, 4, tzinfo=timezone.utc)), + ('2019-08-25 10:45', + datetime(2019, 8, 25, 10, 45, tzinfo=timezone.utc)), + ('March 9, 2015', + datetime(2015, 3, 9, 0, 0, tzinfo=timezone.utc)), + ('Aug. 
18, 2012', + datetime(2012, 8, 18, 0, 0, tzinfo=timezone.utc)), + ('2014-Dec-17', + datetime(2014, 12, 17, 0, 0, tzinfo=timezone.utc)), + ('March 01, 2013', + datetime(2013, 3, 1, 0, 0, tzinfo=timezone.utc)), + ('2017-04-08.', + datetime(2017, 4, 8, 0, 0, tzinfo=timezone.utc)), + ('2014-Apr-22', + datetime(2014, 4, 22, 0, 0, tzinfo=timezone.utc)), + ('Mon Jan 12 19:54:04 2015', + datetime(2015, 1, 12, 19, 54, 4, tzinfo=timezone.utc)), + ('May 22, 2014', + datetime(2014, 5, 22, 0, 0, tzinfo=timezone.utc)), + ('2014-08-12 09:55:10 EDT', + datetime(2014, 8, 12, 9, 55, 10, tzinfo=timezone.utc)), + # unparsable + ('Fabruary 21, 2012', None), + ('2019-05-28"', None), + ('2017-03-01 today', None), + ('2016-11-0110.1093/icesjms/fsw182', None), + ('2019-07-010', None), + ('2015-02.23', None), + ('20013-12-30', None), + ('2016-08-017', None), + ('2019-02-07l', None), + ('2018-05-010', None), + ('2019-09-27 KST', None), + ('$Date$', None), + ('2019-09-27 KST', None), + ('2019-06-22 $Date$', None), + ('$Date: 2013-01-18 12:49:03 -0600 (Fri, 18 Jan 2013) $', None), + ('2015-7-013', None), + ('2018-05-023', None), + ("Check NEWS file for changes: news(package='simSummary')", None) + ] + for date, expected_date in data: + actual_date = parse_date(date) + assert actual_date == expected_date, f'input date to parse {date}' + + +@pytest.mark.fs +def test_extract_intrinsic_metadata(tmp_path, datadir): + """Parsing existing archive's PKG-INFO should yield results""" + uncompressed_archive_path = str(tmp_path) + # sample url + # https://cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz # noqa + archive_path = path.join( + datadir, 'https_cran.r-project.org', + 'src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz') + uncompress(archive_path, dest=uncompressed_archive_path) + + actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path) + + expected_metadata = { + 'Package': 'KernSmooth', + 'Priority': 'recommended', + 'Version': '2.22-6', + 'Date': 
'2001-June-08', + 'Title': 'Functions for kernel smoothing for Wand & Jones (1995)', + 'Author': 'S original by Matt Wand.\n\tR port by Brian Ripley .', # noqa + 'Maintainer': 'Brian Ripley ', + 'Description': 'functions for kernel smoothing (and density estimation)\n corresponding to the book: \n Wand, M.P. and Jones, M.C. (1995) "Kernel Smoothing".', # noqa + 'License': 'Unlimited use and distribution (see LICENCE).', + 'URL': 'http://www.biostat.harvard.edu/~mwand' + } + + assert actual_metadata == expected_metadata + + +@pytest.mark.fs +def test_extract_intrinsic_metadata_failures(tmp_path): + """Parsing inexistent path/archive/PKG-INFO yield None""" + # inexistent first level path + assert extract_intrinsic_metadata('/something-inexistent') == {} + # inexistent second level path (as expected by pypi archives) + assert extract_intrinsic_metadata(tmp_path) == {} + # inexistent PKG-INFO within second level path + existing_path_no_pkginfo = str(tmp_path / 'something') + os.mkdir(existing_path_no_pkginfo) + assert extract_intrinsic_metadata(tmp_path) == {} + + +def test_cran_one_visit(swh_config, requests_mock_datadir): + version = '2.22-6' + base_url = 'https://cran.r-project.org' + url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz' # noqa + loader = CRANLoader(url, version=version) + + actual_load_status = loader.load() + + expected_snapshot_id = '920adcccc78aaeedd3cfa4459dd900d8c3431a21' + assert actual_load_status == { + 'status': 'eventful', + 'snapshot_id': expected_snapshot_id + } + + expected_snapshot = { + 'id': expected_snapshot_id, + 'branches': { + 'HEAD': {'target': f'releases/{version}', 'target_type': 'alias'}, + f'releases/{version}': { + 'target': '42bdb16facd5140424359c8ce89a28ecfa1ce603', + 'target_type': 'revision' + } + } + } + check_snapshot(expected_snapshot, loader.storage) + + origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'full' + assert origin_visit['type'] == 'cran' 
+ + visit_stats = get_stats(loader.storage) + assert { + 'content': 33, + 'directory': 7, + 'origin': 1, + 'origin_visit': 1, + 'person': 1, + 'release': 0, + 'revision': 1, + 'skipped_content': 0, + 'snapshot': 1 + } == visit_stats + + urls = [ + m.url for m in requests_mock_datadir.request_history + if m.url.startswith(base_url) + ] + # visited each artifact once across 2 visits + assert len(urls) == 1 diff --git a/swh/loader/package/cran/tests/test_tasks.py b/swh/loader/package/cran/tests/test_tasks.py new file mode 100644 index 0000000..4406375 --- /dev/null +++ b/swh/loader/package/cran/tests/test_tasks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_cran_loader(mocker, swh_app, celery_session_worker, swh_config): + mock_loader = mocker.patch( + 'swh.loader.package.cran.loader.CRANLoader.load') + mock_loader.return_value = {'status': 'eventful'} + + res = swh_app.send_task( + 'swh.loader.package.cran.tasks.LoadCran', + (), dict(url='some-url', version='1.2.3')) + assert res + res.wait() + assert res.successful() + + assert res.result == {'status': 'eventful'} diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py index 2569946..6ae7669 100644 --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -1,344 +1,267 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import os -import re from codecs import BOM_UTF8 from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional import chardet import iso8601 from urllib.parse import quote from 
swh.model.identifiers import normalize_timestamp from swh.loader.package.loader import PackageLoader -from swh.loader.package.utils import api_info, release_name +from swh.loader.package.utils import ( + api_info, release_name, parse_author, swh_author +) logger = logging.getLogger(__name__) -_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None} - -# https://github.com/jonschlinkert/author-regex -_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)' - - class NpmLoader(PackageLoader): """Load npm origin's artifact releases into swh archive. """ visit_type = 'npm' def __init__(self, url: str): """Constructor Args str: origin url (e.g. https://www.npmjs.com/package/) """ super().__init__(url=url) package_name = url.split('https://www.npmjs.com/package/')[1] safe_name = quote(package_name, safe='') self.provider_url = f'https://replicate.npmjs.com/{safe_name}/' self._info: Dict[str, Any] = {} self._versions = None @property def info(self) -> Dict[str, Any]: """Return the project metadata information (fetched from npm registry) """ if not self._info: self._info = api_info(self.provider_url) return self._info def get_versions(self) -> Sequence[str]: return sorted(list(self.info['versions'].keys())) def get_default_version(self) -> str: return self.info['dist-tags'].get('latest', '') def get_package_info(self, version: str) -> Generator[ Tuple[str, Mapping[str, Any]], None, None]: meta = self.info['versions'][version] url = meta['dist']['tarball'] p_info = { 'url': url, 'filename': os.path.basename(url), 'raw': meta, } yield release_name(version), p_info def resolve_revision_from( self, known_artifacts: Dict, artifact_metadata: Dict) \ -> Optional[bytes]: return artifact_to_revision_id(known_artifacts, artifact_metadata) def build_revision( self, a_metadata: Dict, uncompressed_path: str) -> Dict: i_metadata = extract_intrinsic_metadata(uncompressed_path) # from intrinsic metadata author = extract_npm_package_author(i_metadata) message = 
i_metadata['version'].encode('ascii') # from extrinsic metadata # No date available in intrinsic metadata: retrieve it from the API # metadata, using the version number that the API claims this package # has. extrinsic_version = a_metadata['version'] date = self.info['time'][extrinsic_version] date = iso8601.parse_date(date) date = normalize_timestamp(int(date.timestamp())) return { 'type': 'tar', 'message': message, 'author': author, 'date': date, 'committer': author, 'committer_date': date, 'parents': [], 'metadata': { 'intrinsic': { 'tool': 'package.json', 'raw': i_metadata, }, 'extrinsic': { 'provider': self.provider_url, 'when': self.visit_date.isoformat(), 'raw': a_metadata, }, }, } def artifact_to_revision_id( known_artifacts: Dict, artifact_metadata: Dict) -> Optional[bytes]: """Given metadata artifact, solves the associated revision id. The following code allows to deal with 2 metadata formats: - old format sample: { 'package_source': { 'sha1': '05181c12cd8c22035dd31155656826b85745da37', } } - new format sample: { 'original_artifact': [{ 'checksums': { 'sha256': "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec", # noqa ... }, }], ... } """ shasum = artifact_metadata['dist']['shasum'] for rev_id, known_artifact in known_artifacts.items(): known_original_artifact = known_artifact.get('original_artifact') if not known_original_artifact: # previous loader-npm version kept original artifact elsewhere known_original_artifact = known_artifact.get('package_source') if not known_original_artifact: continue original_hash = known_original_artifact['sha1'] else: assert isinstance(known_original_artifact, list) original_hash = known_original_artifact[0]['checksums']['sha1'] if shasum == original_hash: return rev_id return None -def parse_npm_package_author(author_str): - """ - Parse npm package author string. 
- - It works with a flexible range of formats, as detailed below:: - - name - name (url) - name (url) - name (url) - name(url) - name (url) - name (url) - name(url) - name(url) - name (url) - name(url) - name - name - (url) - (url) - (url) - (url) - - (url) - - Args: - author_str (str): input author string - - Returns: - dict: A dict that may contain the following keys: - * name - * email - * url - - """ - author = {} - matches = re.findall(_author_regexp, - author_str.replace('<>', '').replace('()', ''), - re.M) - for match in matches: - if match[0].strip(): - author['name'] = match[0].strip() - if match[1].strip(): - author['email'] = match[1].strip() - if match[2].strip(): - author['url'] = match[2].strip() - return author - - def extract_npm_package_author(package_json): """ Extract package author from a ``package.json`` file content and return it in swh format. Args: package_json (dict): Dict holding the content of parsed ``package.json`` file Returns: dict: A dict with the following keys: * fullname * name * email """ def _author_str(author_data): if type(author_data) is dict: author_str = '' if 'name' in author_data: author_str += author_data['name'] if 'email' in author_data: author_str += ' <%s>' % author_data['email'] return author_str elif type(author_data) is list: return _author_str(author_data[0]) if len(author_data) > 0 else '' else: return author_data author_data = {} for author_key in ('author', 'authors'): if author_key in package_json: author_str = _author_str(package_json[author_key]) - author_data = parse_npm_package_author(author_str) - - name = author_data.get('name') - email = author_data.get('email') - - fullname = None - - if name and email: - fullname = '%s <%s>' % (name, email) - elif name: - fullname = name - - if not fullname: - return _EMPTY_AUTHOR - - if fullname: - fullname = fullname.encode('utf-8') - - if name: - name = name.encode('utf-8') - - if email: - email = email.encode('utf-8') + author_data = parse_author(author_str) - 
return {'fullname': fullname, 'name': name, 'email': email} + return swh_author(author_data) def _lstrip_bom(s, bom=BOM_UTF8): if s.startswith(bom): return s[len(bom):] else: return s def load_json(json_bytes): """ Try to load JSON from bytes and return a dictionary. First try to decode from utf-8. If the decoding failed, try to detect the encoding and decode again with replace error handling. If JSON is malformed, an empty dictionary will be returned. Args: json_bytes (bytes): binary content of a JSON file Returns: dict: JSON data loaded in a dictionary """ json_data = {} try: json_str = _lstrip_bom(json_bytes).decode('utf-8') except UnicodeDecodeError: encoding = chardet.detect(json_bytes)['encoding'] if encoding: json_str = json_bytes.decode(encoding, 'replace') try: json_data = json.loads(json_str) except json.decoder.JSONDecodeError: pass return json_data def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from npm. Returns: the pkginfo parsed structure as a dict if any or None if none was present. 
""" # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) == 0: return {} project_dirname = lst[0] package_json_path = os.path.join(dir_path, project_dirname, 'package.json') if not os.path.exists(package_json_path): return {} with open(package_json_path, 'rb') as package_json_file: package_json_bytes = package_json_file.read() return load_json(package_json_bytes) diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py index b3bb0dd..dc5438f 100644 --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -1,653 +1,511 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os import pytest from swh.model.hashutil import hash_to_bytes from swh.loader.package.npm.loader import ( - NpmLoader, parse_npm_package_author, extract_npm_package_author, + NpmLoader, extract_npm_package_author, artifact_to_revision_id ) from swh.loader.package.tests.common import ( check_snapshot, check_metadata_paths, get_stats ) -def _parse_author_string_test(author_str, expected_result): - assert parse_npm_package_author(author_str) == expected_result - assert parse_npm_package_author(' %s' % author_str) == expected_result - assert parse_npm_package_author('%s ' % author_str) == expected_result - - -def test_parse_npm_package_author(): - _parse_author_string_test( - 'John Doe', - { - 'name': 'John Doe' - } - ) - - _parse_author_string_test( - '', - { - 'email': 'john.doe@foo.bar' - } - ) - - _parse_author_string_test( - '(https://john.doe)', - { - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe ', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar' - } - ) - - _parse_author_string_test( - 'John Doe', - 
{ - 'name': 'John Doe', - 'email': 'john.doe@foo.bar' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe)', - { - 'name': 'John Doe', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe(https://john.doe)', - { - 'name': 'John Doe', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - ' (https://john.doe)', - { - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - '(https://john.doe) ', - { - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe)', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe) ', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe)', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe(https://john.doe)', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test('', {}) - _parse_author_string_test('<>', {}) - _parse_author_string_test(' <>', {}) - _parse_author_string_test('<>()', {}) - _parse_author_string_test('<> ()', {}) - _parse_author_string_test('()', {}) - _parse_author_string_test(' ()', {}) - - _parse_author_string_test( - 'John Doe <> ()', - { - 'name': 'John Doe' - } - ) - - _parse_author_string_test( - 'John Doe <>', - { - 'name': 'John Doe' - } - ) - - _parse_author_string_test( - 'John Doe ()', - { - 'name': 'John Doe' - } - ) - - def test_extract_npm_package_author(datadir): package_metadata_filepath = os.path.join( datadir, 'https_replicate.npmjs.com', 'org_visit1') with open(package_metadata_filepath) as json_file: package_metadata = json.load(json_file) 
extract_npm_package_author(package_metadata['versions']['0.0.2']) == \ { 'fullname': b'mooz ', 'name': b'mooz', 'email': b'stillpedant@gmail.com' } assert ( extract_npm_package_author(package_metadata['versions']['0.0.3']) == { 'fullname': b'Masafumi Oyamada ', 'name': b'Masafumi Oyamada', 'email': b'stillpedant@gmail.com' } ) package_json = json.loads(''' { "name": "highlightjs-line-numbers.js", "version": "2.7.0", "description": "Highlight.js line numbers plugin.", "main": "src/highlightjs-line-numbers.js", "dependencies": {}, "devDependencies": { "gulp": "^4.0.0", "gulp-rename": "^1.4.0", "gulp-replace": "^0.6.1", "gulp-uglify": "^1.2.0" }, "repository": { "type": "git", "url": "https://github.com/wcoder/highlightjs-line-numbers.js.git" }, "author": "Yauheni Pakala ", "license": "MIT", "bugs": { "url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues" }, "homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/" }''') # noqa assert extract_npm_package_author(package_json) == \ { 'fullname': b'Yauheni Pakala ', 'name': b'Yauheni Pakala', 'email': b'evgeniy.pakalo@gmail.com' } package_json = json.loads(''' { "name": "3-way-diff", "version": "0.0.1", "description": "3-way diffing of JavaScript objects", "main": "index.js", "authors": [ { "name": "Shawn Walsh", "url": "https://github.com/shawnpwalsh" }, { "name": "Markham F Rollins IV", "url": "https://github.com/mrollinsiv" } ], "keywords": [ "3-way diff", "3 way diff", "three-way diff", "three way diff" ], "devDependencies": { "babel-core": "^6.20.0", "babel-preset-es2015": "^6.18.0", "mocha": "^3.0.2" }, "dependencies": { "lodash": "^4.15.0" } }''') assert extract_npm_package_author(package_json) == \ { 'fullname': b'Shawn Walsh', 'name': b'Shawn Walsh', 'email': None } package_json = json.loads(''' { "name": "yfe-ynpm", "version": "1.0.0", "homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm", "repository": { "type": "git", "url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git" }, "author": [ "fengmk2 
(https://fengmk2.com)", "xufuzi (https://7993.org)" ], "license": "MIT" }''') assert extract_npm_package_author(package_json) == \ { 'fullname': b'fengmk2 ', 'name': b'fengmk2', 'email': b'fengmk2@gmail.com' } package_json = json.loads(''' { "name": "umi-plugin-whale", "version": "0.0.8", "description": "Internal contract component", "authors": { "name": "xiaohuoni", "email": "448627663@qq.com" }, "repository": "alitajs/whale", "devDependencies": { "np": "^3.0.4", "umi-tools": "*" }, "license": "MIT" }''') assert extract_npm_package_author(package_json) == \ { 'fullname': b'xiaohuoni <448627663@qq.com>', 'name': b'xiaohuoni', 'email': b'448627663@qq.com' } def normalize_hashes(hashes): if isinstance(hashes, str): return hash_to_bytes(hashes) if isinstance(hashes, list): return [hash_to_bytes(x) for x in hashes] return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()} _expected_new_contents_first_visit = normalize_hashes([ '4ce3058e16ab3d7e077f65aabf855c34895bf17c', '858c3ceee84c8311adc808f8cdb30d233ddc9d18', '0fa33b4f5a4e0496da6843a38ff1af8b61541996', '85a410f8ef8eb8920f2c384a9555566ad4a2e21b', '9163ac8025923d5a45aaac482262893955c9b37b', '692cf623b8dd2c5df2c2998fd95ae4ec99882fb4', '18c03aac6d3e910efb20039c15d70ab5e0297101', '41265c42446aac17ca769e67d1704f99e5a1394d', '783ff33f5882813dca9239452c4a7cadd4dba778', 'b029cfb85107aee4590c2434a3329bfcf36f8fa1', '112d1900b4c2e3e9351050d1b542c9744f9793f3', '5439bbc4bd9a996f1a38244e6892b71850bc98fd', 'd83097a2f994b503185adf4e719d154123150159', 'd0939b4898e83090ee55fd9d8a60e312cfadfbaf', 'b3523a26f7147e4af40d9d462adaae6d49eda13e', 'cd065fb435d6fb204a8871bcd623d0d0e673088c', '2854a40855ad839a54f4b08f5cff0cf52fca4399', 'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe', '0f73d56e1cf480bded8a1ecf20ec6fc53c574713', '0d9882b2dfafdce31f4e77fe307d41a44a74cefe', '585fc5caab9ead178a327d3660d35851db713df1', 'e8cd41a48d79101977e3036a87aeb1aac730686f', '5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7', 
'9c3cc2763bf9e9e37067d3607302c4776502df98', '3649a68410e354c83cd4a38b66bd314de4c8f5c9', 'e96ed0c091de1ebdf587104eaf63400d1974a1fe', '078ca03d2f99e4e6eab16f7b75fbb7afb699c86c', '38de737da99514de6559ff163c988198bc91367a', ]) _expected_new_directories_first_visit = normalize_hashes([ '3370d20d6f96dc1c9e50f083e2134881db110f4f', '42753c0c2ab00c4501b552ac4671c68f3cf5aece', 'd7895533ef5edbcffdea3f057d9fef3a1ef845ce', '80579be563e2ef3e385226fe7a3f079b377f142c', '3b0ddc6a9e58b4b53c222da4e27b280b6cda591c', 'bcad03ce58ac136f26f000990fc9064e559fe1c0', '5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca', 'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd', '584b5b4b6cf7f038095e820b99386a9c232de931', '184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a', 'bb5f4ee143c970367eb409f2e4c1104898048b9d', '1b95491047add1103db0dfdfa84a9735dcb11e88', 'a00c6de13471a2d66e64aca140ddb21ef5521e62', '5ce6c1cd5cda2d546db513aaad8c72a44c7771e2', 'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2', '202fafcd7c0f8230e89d5496ad7f44ab12b807bf', '775cc516543be86c15c1dc172f49c0d4e6e78235', 'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e', ]) _expected_new_revisions_first_visit = normalize_hashes({ 'd8a1c7474d2956ac598a19f0f27d52f7015f117e': '42753c0c2ab00c4501b552ac4671c68f3cf5aece', '5f9eb78af37ffd12949f235e86fac04898f9f72a': '3370d20d6f96dc1c9e50f083e2134881db110f4f', 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a': 'd7895533ef5edbcffdea3f057d9fef3a1ef845ce'} ) def package_url(package): return 'https://www.npmjs.com/package/%s' % package def package_metadata_url(package): return 'https://replicate.npmjs.com/%s/' % package def test_revision_metadata_structure(swh_config, requests_mock_datadir): package = 'org' loader = NpmLoader(package_url(package)) actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' assert actual_load_status['snapshot_id'] is not None expected_revision_id = hash_to_bytes( 'd8a1c7474d2956ac598a19f0f27d52f7015f117e') revision = list(loader.storage.revision_get([expected_revision_id]))[0] assert 
revision is not None check_metadata_paths(revision['metadata'], paths=[ ('intrinsic.tool', str), ('intrinsic.raw', dict), ('extrinsic.provider', str), ('extrinsic.when', str), ('extrinsic.raw', dict), ('original_artifact', list), ]) for original_artifact in revision['metadata']['original_artifact']: check_metadata_paths(original_artifact, paths=[ ('filename', str), ('length', int), ('checksums', dict), ]) def test_npm_loader_first_visit(swh_config, requests_mock_datadir): package = 'org' loader = NpmLoader(package_url(package)) actual_load_status = loader.load() expected_snapshot_id = 'd0587e1195aed5a8800411a008f2f2d627f18e2d' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } stats = get_stats(loader.storage) assert { 'content': len(_expected_new_contents_first_visit), 'directory': len(_expected_new_directories_first_visit), 'origin': 1, 'origin_visit': 1, 'person': 2, 'release': 0, 'revision': len(_expected_new_revisions_first_visit), 'skipped_content': 0, 'snapshot': 1, } == stats assert len(list(loader.storage.content_get( _expected_new_contents_first_visit))) == len( _expected_new_contents_first_visit) assert list(loader.storage.directory_missing( _expected_new_directories_first_visit)) == [] assert list(loader.storage.revision_missing( _expected_new_revisions_first_visit)) == [] expected_snapshot = { 'id': expected_snapshot_id, 'branches': { 'HEAD': { 'target': 'releases/0.0.4', 'target_type': 'alias' }, 'releases/0.0.2': { 'target': 'd8a1c7474d2956ac598a19f0f27d52f7015f117e', 'target_type': 'revision' }, 'releases/0.0.3': { 'target': '5f9eb78af37ffd12949f235e86fac04898f9f72a', 'target_type': 'revision' }, 'releases/0.0.4': { 'target': 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a', 'target_type': 'revision' } } } check_snapshot(expected_snapshot, loader.storage) def test_npm_loader_incremental_visit( swh_config, requests_mock_datadir_visits): package = 'org' url = package_url(package) loader = NpmLoader(url) 
actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' assert actual_load_status['status'] is not None origin_visit = list(loader.storage.origin_visit_get(url))[-1] assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'npm' stats = get_stats(loader.storage) assert { 'content': len(_expected_new_contents_first_visit), 'directory': len(_expected_new_directories_first_visit), 'origin': 1, 'origin_visit': 1, 'person': 2, 'release': 0, 'revision': len(_expected_new_revisions_first_visit), 'skipped_content': 0, 'snapshot': 1, } == stats loader._info = None # reset loader internal state actual_load_status2 = loader.load() assert actual_load_status2['status'] == 'eventful' snap_id2 = actual_load_status2['snapshot_id'] assert snap_id2 is not None assert snap_id2 != actual_load_status['snapshot_id'] origin_visit2 = list(loader.storage.origin_visit_get(url))[-1] assert origin_visit2['status'] == 'full' assert origin_visit2['type'] == 'npm' stats = get_stats(loader.storage) assert { # 3 new releases artifacts 'content': len(_expected_new_contents_first_visit) + 14, 'directory': len(_expected_new_directories_first_visit) + 15, 'origin': 1, 'origin_visit': 2, 'person': 2, 'release': 0, 'revision': len(_expected_new_revisions_first_visit) + 3, 'skipped_content': 0, 'snapshot': 2, } == stats urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith('https://registry.npmjs.org') ] assert len(urls) == len(set(urls)) # we visited each artifact once across @pytest.mark.usefixtures('requests_mock_datadir') def test_npm_loader_version_divergence(swh_config): package = '@aller_shared' url = package_url(package) loader = NpmLoader(url) actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' assert actual_load_status['status'] is not None origin_visit = list(loader.storage.origin_visit_get(url))[-1] assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'npm' stats = 
get_stats(loader.storage) assert { # 1 new releases artifacts 'content': 534, 'directory': 153, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1, } == stats expected_snapshot = { 'id': 'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92', 'branches': { 'HEAD': { 'target_type': 'alias', 'target': 'releases/0.1.0' }, 'releases/0.1.0': { 'target_type': 'revision', 'target': '845673bfe8cbd31b1eaf757745a964137e6f9116', }, 'releases/0.1.1-alpha.14': { 'target_type': 'revision', 'target': '05181c12cd8c22035dd31155656826b85745da37', }, }, } check_snapshot(expected_snapshot, loader.storage) def test_npm_artifact_to_revision_id_none(): """Current loader version should stop soon if nothing can be found """ artifact_metadata = { 'dist': { 'shasum': '05181c12cd8c22035dd31155656826b85745da37', }, } known_artifacts = { 'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92': {}, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None def test_npm_artifact_to_revision_id_old_loader_version(): """Current loader version should solve old metadata scheme """ artifact_metadata = { 'dist': { 'shasum': '05181c12cd8c22035dd31155656826b85745da37', } } known_artifacts = { hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): { 'package_source': { 'sha1': "something-wrong" } }, hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): { 'package_source': { 'sha1': '05181c12cd8c22035dd31155656826b85745da37', } } } assert artifact_to_revision_id(known_artifacts, artifact_metadata) \ == hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116') def test_npm_artifact_to_revision_id_current_loader_version(): """Current loader version should be able to solve current metadata scheme """ artifact_metadata = { 'dist': { 'shasum': '05181c12cd8c22035dd31155656826b85745da37', } } known_artifacts = { hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): { 'original_artifact': [{ 'checksums': { 'sha1': 
"05181c12cd8c22035dd31155656826b85745da37" }, }], }, hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): { 'original_artifact': [{ 'checksums': { 'sha1': 'something-wrong' }, }], }, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) \ == hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92') diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py index 549defc..63024b8 100644 --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -1,157 +1,307 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pytest import swh.loader.package -from swh.loader.package.utils import download, api_info, release_name +from swh.loader.package.utils import ( + download, api_info, release_name, parse_author +) def test_version_generation(): assert swh.loader.package.__version__ != 'devel', \ "Make sure swh.loader.core is installed (e.g. pip install -e .)" @pytest.mark.fs def test_download_fail_to_download(tmp_path, requests_mock): url = 'https://pypi.org/pypi/arrow/json' status_code = 404 requests_mock.get(url, status_code=status_code) with pytest.raises(ValueError) as e: download(url, tmp_path) assert e.value.args[0] == "Fail to query '%s'. 
Reason: %s" % ( url, status_code) @pytest.mark.fs def test_download_ok(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" filename = 'requests-0.0.1.tar.gz' url = 'https://pypi.org/pypi/requests/%s' % filename data = 'this is something' requests_mock.get(url, text=data, headers={ 'content-length': str(len(data)) }) actual_filepath, actual_hashes = download(url, dest=str(tmp_path)) actual_filename = os.path.basename(actual_filepath) assert actual_filename == filename assert actual_hashes['length'] == len(data) assert actual_hashes['checksums']['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2' # noqa assert (actual_hashes['checksums']['sha256'] == '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5') @pytest.mark.fs def test_download_ok_no_header(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" filename = 'requests-0.0.1.tar.gz' url = 'https://pypi.org/pypi/requests/%s' % filename data = 'this is something' requests_mock.get(url, text=data) # no header information actual_filepath, actual_hashes = download(url, dest=str(tmp_path)) actual_filename = os.path.basename(actual_filepath) assert actual_filename == filename assert actual_hashes['length'] == len(data) assert actual_hashes['checksums']['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2' # noqa assert (actual_hashes['checksums']['sha256'] == '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5') @pytest.mark.fs def test_download_ok_with_hashes(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" filename = 'requests-0.0.1.tar.gz' url = 'https://pypi.org/pypi/requests/%s' % filename data = 'this is something' requests_mock.get(url, text=data, headers={ 'content-length': str(len(data)) }) # good hashes for such file good = { 'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2', 'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5', # noqa } 
actual_filepath, actual_hashes = download(url, dest=str(tmp_path), hashes=good) actual_filename = os.path.basename(actual_filepath) assert actual_filename == filename assert actual_hashes['length'] == len(data) assert actual_hashes['checksums']['sha1'] == good['sha1'] assert actual_hashes['checksums']['sha256'] == good['sha256'] @pytest.mark.fs def test_download_fail_hashes_mismatch(tmp_path, requests_mock): """Mismatch hash after download should raise """ filename = 'requests-0.0.1.tar.gz' url = 'https://pypi.org/pypi/requests/%s' % filename data = 'this is something' requests_mock.get(url, text=data, headers={ 'content-length': str(len(data)) }) # good hashes for such file good = { 'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2', 'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5', # noqa } for hash_algo in good.keys(): wrong_hash = good[hash_algo].replace('1', '0') expected_hashes = good.copy() expected_hashes[hash_algo] = wrong_hash # set the wrong hash expected_msg = ("Failure when fetching %s. " "Checksum mismatched: %s != %s" % ( url, wrong_hash, good[hash_algo] )) with pytest.raises(ValueError, match=expected_msg): download(url, dest=str(tmp_path), hashes=expected_hashes) def test_api_info_failure(requests_mock): """Failure to fetch info/release information should raise""" url = 'https://pypi.org/pypi/requests/json' status_code = 400 requests_mock.get(url, status_code=status_code) with pytest.raises(ValueError) as e0: api_info(url) assert e0.value.args[0] == "Fail to query '%s'. 
Reason: %s" % ( url, status_code ) def test_api_info(requests_mock): """Fetching json info from pypi project should be ok""" url = 'https://pypi.org/pypi/requests/json' requests_mock.get(url, text='{"version": "0.0.1"}') actual_info = api_info(url) assert actual_info == { 'version': '0.0.1', } def test_release_name(): for version, filename, expected_release in [ ('0.0.1', None, 'releases/0.0.1'), ('0.0.2', 'something', 'releases/0.0.2/something')]: assert release_name(version, filename) == expected_release + + +def _parse_author_string_test(author_str, expected_result): + assert parse_author(author_str) == expected_result + assert parse_author(' %s' % author_str) == expected_result + assert parse_author('%s ' % author_str) == expected_result + + +def test_parse_author(): + _parse_author_string_test( + 'John Doe', + { + 'name': 'John Doe' + } + ) + + _parse_author_string_test( + '', + { + 'email': 'john.doe@foo.bar' + } + ) + + _parse_author_string_test( + '(https://john.doe)', + { + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe ', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar' + } + ) + + _parse_author_string_test( + 'John Doe', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar' + } + ) + + _parse_author_string_test( + 'John Doe (https://john.doe)', + { + 'name': 'John Doe', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe(https://john.doe)', + { + 'name': 'John Doe', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + ' (https://john.doe)', + { + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + '(https://john.doe) ', + { + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe (https://john.doe)', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe (https://john.doe) ', + { + 'name': 'John 
Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe (https://john.doe)', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe(https://john.doe)', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test('', {}) + _parse_author_string_test('<>', {}) + _parse_author_string_test(' <>', {}) + _parse_author_string_test('<>()', {}) + _parse_author_string_test('<> ()', {}) + _parse_author_string_test('()', {}) + _parse_author_string_test(' ()', {}) + + _parse_author_string_test( + 'John Doe <> ()', + { + 'name': 'John Doe' + } + ) + + _parse_author_string_test( + 'John Doe <>', + { + 'name': 'John Doe' + } + ) + + _parse_author_string_test( + 'John Doe ()', + { + 'name': 'John Doe' + } + ) + + +# def test_swh_author(): +# for author, expected_author in [ +# ({}, ) +# ]: diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py index 14330dd..9f78a35 100644 --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -1,113 +1,197 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import logging import os import requests +import re from typing import Dict, Optional, Tuple from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE from swh.loader.package import DEFAULT_PARAMS logger = logging.getLogger(__name__) DOWNLOAD_HASHES = set(['sha1', 'sha256', 'length']) +# https://github.com/jonschlinkert/author-regex +_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)' + + +_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None} + + def api_info(url: str) -> Dict: """Basic api client to retrieve information on 
def download(url: str, dest: str, hashes: Optional[Dict] = None,
             filename: Optional[str] = None,
             auth: Optional[Tuple[str, str]] = None) -> Tuple[str, Dict]:
    """Download a remote artifact from url and compute its hashes on the fly.

    The response is streamed to disk chunk by chunk and every chunk is fed
    to the hasher, so the artifact is never held in memory as a whole. The
    artifact is written as-is (it is not uncompressed here).

    Args:
        url: Artifact uri to fetch and hash
        dest: Directory to write the artifact to
        hashes: Optional dict of expected hashes (key is the hash algo, value
            the expected hex digest); each key is looked up in the computed
            digests. No check is done when it is None or empty.
        filename: Optional name of the local file; defaults to the basename
            of the url
        auth: Optional tuple of login/password (for http authentication
            service, e.g. deposit)

    Raises:
        ValueError if the server does not answer with a 200 status code or
        if one of the expected hashes does not match the computed one

    Returns:
        Tuple of local (filepath, metadata) where metadata is a dict with
        the keys 'length', 'filename' and 'checksums'

    """
    # Work on a copy so the shared default parameters are never altered
    params = copy.deepcopy(DEFAULT_PARAMS)
    if auth is not None:
        params['auth'] = auth
    response = requests.get(url, **params, stream=True)
    if response.status_code != 200:
        raise ValueError("Fail to query '%s'. Reason: %s" % (
            url, response.status_code))

    filename = filename if filename else os.path.basename(url)
    logger.debug('filename: %s', filename)
    filepath = os.path.join(dest, filename)
    logger.debug('filepath: %s', filepath)

    h = MultiHash(hash_names=DOWNLOAD_HASHES)
    with open(filepath, 'wb') as f:
        for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
            h.update(chunk)
            f.write(chunk)

    computed_hashes = h.hexdigest()

    # Also check the expected hashes if provided (done before 'length' is
    # popped so an expected 'length' entry is still checked)
    if hashes:
        for algo_hash, expected_digest in hashes.items():
            actual_digest = computed_hashes[algo_hash]
            if actual_digest != expected_digest:
                raise ValueError(
                    'Failure when fetching %s. '
                    'Checksum mismatched: %s != %s' % (
                        url, expected_digest, actual_digest))

    length = computed_hashes.pop('length')
    extrinsic_metadata = {
        'length': length,
        'filename': filename,
        'checksums': computed_hashes,
    }
    # The placeholder is required: logging formats lazily with msg % args
    logger.debug('extrinsic_metadata: %s', extrinsic_metadata)

    return filepath, extrinsic_metadata
def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]:
    """Transform an author like dict to an expected swh like dict (values are
    bytes)

    Only the 'name' and 'email' keys of the input are read; any other key
    (e.g. 'url') is ignored.

    Args:
        author: Author dict, possibly holding the 'name' and 'email' keys

    Returns:
        A dict with the keys 'fullname', 'name' and 'email'. Values are
        utf-8 encoded bytes; 'email' is None when it is missing. When there
        is no name to build a fullname from, a fresh copy of the empty
        author is returned (the email, if any, is then dropped).

    """
    name = author.get('name')
    email = author.get('email')

    if name and email:
        fullname = '%s <%s>' % (name, email)
    elif name:
        fullname = name
    else:
        # Return a copy: handing out the module-level dict itself would let
        # a caller mutating the result alter every later empty author
        return dict(_EMPTY_AUTHOR)

    # Reaching this point implies name is set, so it can be encoded as is
    return {
        'fullname': fullname.encode('utf-8'),
        'name': name.encode('utf-8'),
        'email': email.encode('utf-8') if email else None,
    }
""" loader_type = 'unknown' assert loader_type not in SUPPORTED_LOADERS with pytest.raises(ValueError, match='Invalid loader'): get_loader(loader_type, url='db-url') def test_get_loader(swh_config): """Instantiating a supported loader should be ok """ loader_input = { 'archive': { 'url': 'some-url', 'artifacts': [], }, 'debian': { 'url': 'some-url', 'date': 'something', 'packages': [], }, 'deposit': { 'url': 'some-url', 'deposit_id': 1, }, 'npm': { 'url': 'https://www.npmjs.com/package/onepackage', }, 'pypi': { 'url': 'some-url', }, } for loader_type, kwargs in loader_input.items(): loader = get_loader(loader_type, **kwargs) assert isinstance(loader, PackageLoader) def test_run_help(swh_config): """Help message should be ok """ runner = CliRunner() result = runner.invoke(run, ['-h']) assert result.exit_code == 0 - expected_help_msg = """Usage: run [OPTIONS] [archive|debian|deposit|npm|pypi] URL [OPTIONS]... + expected_help_msg = """Usage: run [OPTIONS] [archive|cran|debian|deposit|npm|pypi] URL [OPTIONS]... Ingest with loader the origin located at Options: -h, --help Show this message and exit. """ # noqa assert result.output.startswith(expected_help_msg) def test_run_pypi(mocker, swh_config): """Triggering a load should be ok """ mock_loader = mocker.patch('swh.loader.package.pypi.loader.PyPILoader') runner = CliRunner() result = runner.invoke(run, ['pypi', 'https://some-url']) assert result.exit_code == 0 mock_loader.assert_called_once_with(url='https://some-url') # constructor def test_list_help(mocker, swh_config): """Triggering a load should be ok """ runner = CliRunner() result = runner.invoke(list, ['--help']) assert result.exit_code == 0 - expected_help_msg = """Usage: list [OPTIONS] [[all|archive|debian|deposit|npm|pypi]] + expected_help_msg = """Usage: list [OPTIONS] [[all|archive|cran|debian|deposit|npm|pypi]] List supported loaders and optionally their arguments Options: -h, --help Show this message and exit. 
""" # noqa assert result.output.startswith(expected_help_msg) def test_list_help_npm(mocker, swh_config): """Triggering a load should be ok """ runner = CliRunner() result = runner.invoke(list, ['npm']) assert result.exit_code == 0 expected_help_msg = '''Loader: Load npm origin's artifact releases into swh archive. signature: (url: str) ''' # noqa assert result.output.startswith(expected_help_msg)