diff --git a/conftest.py b/conftest.py --- a/conftest.py +++ b/conftest.py @@ -59,6 +59,7 @@ def celery_includes(): return [ 'swh.loader.package.archive.tasks', + 'swh.loader.package.cran.tasks', 'swh.loader.package.debian.tasks', 'swh.loader.package.deposit.tasks', 'swh.loader.package.npm.tasks', diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ iso8601 pkginfo python-debian +python-dateutil diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ entry_points=''' [swh.workers] loader.archive=swh.loader.package.archive:register + loader.cran=swh.loader.package.cran:register loader.debian=swh.loader.package.debian:register loader.deposit=swh.loader.package.deposit:register loader.npm=swh.loader.package.npm:register diff --git a/swh/loader/package/cran/__init__.py b/swh/loader/package/cran/__init__.py new file mode 100644 --- /dev/null +++ b/swh/loader/package/cran/__init__.py @@ -0,0 +1,16 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import CRANLoader + return { + 'task_modules': [f'{__name__}.tasks'], + 'loader': CRANLoader, + } diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py new file mode 100644 --- /dev/null +++ b/swh/loader/package/cran/loader.py @@ -0,0 +1,160 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import dateutil.parser +import datetime +import os +import logging +import re + 
+from datetime import timezone +from os import path +from typing import Any, Generator, Dict, List, Mapping, Optional, Tuple + +from debian.deb822 import Deb822 + +from swh.loader.package.loader import PackageLoader +from swh.loader.package.utils import release_name, parse_author, swh_author +from swh.model.identifiers import normalize_timestamp + + +logger = logging.getLogger(__name__) + + +DATE_PATTERN = re.compile(r'^(?P<year>\d{4})-(?P<month>\d{2})$') + + +class CRANLoader(PackageLoader): + visit_type = 'cran' + + def __init__(self, url: str, version: str): + """Loader constructor. + + Args: + url: Origin url to retrieve cran artifact from + version: version of the cran artifact + + """ + super().__init__(url=url) + self.version = version + self.provider_url = url + + def get_versions(self) -> List[str]: + # only 1 artifact + return [self.version] + + def get_default_version(self) -> str: + return self.version + + def get_package_info(self, version: str) -> Generator[ + Tuple[str, Dict[str, Any]], None, None]: + p_info = { + 'url': self.url, + 'filename': path.split(self.url)[-1], + 'raw': {} + } + yield release_name(version), p_info + + def build_revision( + self, a_metadata: Mapping[str, Any], + uncompressed_path: str) -> Dict[str, Any]: + # a_metadata is empty + metadata = extract_intrinsic_metadata(uncompressed_path) + normalized_date = normalize_timestamp(parse_date(metadata.get('Date'))) + author = swh_author(parse_author(metadata.get('Maintainer', ''))) + version = metadata.get('Version', self.version) + return { + 'message': version.encode('utf-8'), + 'type': 'tar', + 'date': normalized_date, + 'author': author, + 'committer': author, + 'committer_date': normalized_date, + 'parents': [], + 'metadata': { + 'intrinsic': { + 'tool': 'DESCRIPTION', + 'raw': metadata, + }, + 'extrinsic': { + 'provider': self.provider_url, + 'when': self.visit_date.isoformat(), + 'raw': a_metadata, + }, + }, + } + + +def parse_debian_control(filepath: str) -> Dict[str, Any]: + """Parse 
debian control at filepath""" + metadata: Dict = {} + logger.debug('Debian control file %s', filepath) + with open(filepath) as f: + for paragraph in Deb822.iter_paragraphs(f): + logger.debug('paragraph: %s', paragraph) + metadata.update(**paragraph) + logger.debug('metadata parsed: %s', metadata) + return metadata + + +def extract_intrinsic_metadata(dir_path: str) -> Dict[str, Any]: + """Given an uncompressed path holding the DESCRIPTION file, returns a + DESCRIPTION parsed structure as a dict. + + CRAN origins describe their intrinsic metadata within a DESCRIPTION file + at the root tree of a tarball. This DESCRIPTION uses a simple file format + called DCF, the Debian control format. + + The release artifact contains one folder at its root. For example: + $ tar tvf zprint-0.0.6.tar.gz + drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ + ... + + Args: + dir_path (str): Path to the uncompressed directory + representing a release artifact from CRAN. + + Returns: + the DESCRIPTION parsed structure as a dict (or empty dict if missing) + + """ + # Retrieve the root folder of the archive + if not os.path.exists(dir_path): + return {} + lst = os.listdir(dir_path) + if len(lst) != 1: + return {} + project_dirname = lst[0] + description_path = os.path.join(dir_path, project_dirname, 'DESCRIPTION') + if not os.path.exists(description_path): + return {} + return parse_debian_control(description_path) + + +def parse_date(date: Optional[str]) -> Optional[datetime.datetime]: + """Parse a date into a datetime + + """ + assert not date or isinstance(date, str) + dt: Optional[datetime.datetime] = None + if not date: + return dt + try: + specific_date = DATE_PATTERN.match(date) + if specific_date: + year = int(specific_date.group('year')) + month = int(specific_date.group('month')) + dt = datetime.datetime(year, month, 1) + else: + dt = dateutil.parser.parse(date) + + if not dt.tzinfo: + # up for discussion the timezone needs to be set or + # normalize_timestamp is not happy: 
ValueError: normalize_timestamp + # received datetime without timezone: 2001-06-08 00:00:00 + dt = dt.replace(tzinfo=timezone.utc) + except Exception as e: + logger.warning('Fail to parse date %s. Reason: %s', date, e) + return dt diff --git a/swh/loader/package/cran/tasks.py b/swh/loader/package/cran/tasks.py new file mode 100644 --- /dev/null +++ b/swh/loader/package/cran/tasks.py @@ -0,0 +1,14 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.loader.package.cran.loader import CRANLoader + + +@shared_task(name=__name__ + '.LoadCran') +def load_cran(url=None, version=None): + """Load a CRAN package artifact (given its url and version)""" + return CRANLoader(url, version).load() diff --git a/swh/loader/package/cran/tests/__init__.py b/swh/loader/package/cran/tests/__init__.py new file mode 100644 diff --git a/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz b/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@.', # noqa + 'Maintainer': 'Brian Ripley ', + 'Description': 'functions for kernel smoothing (and density estimation)\n corresponding to the book: \n Wand, M.P. and Jones, M.C. 
(1995) "Kernel Smoothing".', # noqa + 'License': 'Unlimited use and distribution (see LICENCE).', + 'URL': 'http://www.biostat.harvard.edu/~mwand' + } + + assert actual_metadata == expected_metadata + + +@pytest.mark.fs +def test_extract_intrinsic_metadata_failures(tmp_path): + """Parsing inexistent path/archive/DESCRIPTION yields an empty dict""" + # inexistent first level path + assert extract_intrinsic_metadata('/something-inexistent') == {} + # inexistent second level path (as expected by CRAN archives) + assert extract_intrinsic_metadata(tmp_path) == {} + # inexistent DESCRIPTION within second level path + existing_path_no_pkginfo = str(tmp_path / 'something') + os.mkdir(existing_path_no_pkginfo) + assert extract_intrinsic_metadata(tmp_path) == {} + + +def test_cran_one_visit(swh_config, requests_mock_datadir): + version = '2.22-6' + base_url = 'https://cran.r-project.org' + url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz' # noqa + loader = CRANLoader(url, version=version) + + actual_load_status = loader.load() + + expected_snapshot_id = '920adcccc78aaeedd3cfa4459dd900d8c3431a21' + assert actual_load_status == { + 'status': 'eventful', + 'snapshot_id': expected_snapshot_id + } + + expected_snapshot = { + 'id': expected_snapshot_id, + 'branches': { + 'HEAD': {'target': f'releases/{version}', 'target_type': 'alias'}, + f'releases/{version}': { + 'target': '42bdb16facd5140424359c8ce89a28ecfa1ce603', + 'target_type': 'revision' + } + } + } + check_snapshot(expected_snapshot, loader.storage) + + origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'full' + assert origin_visit['type'] == 'cran' + + visit_stats = get_stats(loader.storage) + assert { + 'content': 33, + 'directory': 7, + 'origin': 1, + 'origin_visit': 1, + 'person': 1, + 'release': 0, + 'revision': 1, + 'skipped_content': 0, + 'snapshot': 1 + } == visit_stats + + urls = [ + m.url for m in requests_mock_datadir.request_history + if 
m.url.startswith(base_url) + ] + # the single artifact is fetched exactly once during the visit + assert len(urls) == 1 diff --git a/swh/loader/package/cran/tests/test_tasks.py b/swh/loader/package/cran/tests/test_tasks.py new file mode 100644 --- /dev/null +++ b/swh/loader/package/cran/tests/test_tasks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_cran_loader(mocker, swh_app, celery_session_worker, swh_config): + mock_loader = mocker.patch( + 'swh.loader.package.cran.loader.CRANLoader.load') + mock_loader.return_value = {'status': 'eventful'} + + res = swh_app.send_task( + 'swh.loader.package.cran.tasks.LoadCran', + (), dict(url='some-url', version='1.2.3')) + assert res + res.wait() + assert res.successful() + + assert res.result == {'status': 'eventful'} diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -6,7 +6,6 @@ import json import logging import os -import re from codecs import BOM_UTF8 from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional @@ -17,18 +16,14 @@ from urllib.parse import quote from swh.model.identifiers import normalize_timestamp from swh.loader.package.loader import PackageLoader -from swh.loader.package.utils import api_info, release_name +from swh.loader.package.utils import ( + api_info, release_name, parse_author, swh_author +) logger = logging.getLogger(__name__) -_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None} - -# https://github.com/jonschlinkert/author-regex -_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)' - - class NpmLoader(PackageLoader): """Load npm origin's artifact releases into swh archive. 
@@ -162,56 +157,6 @@ return None -def parse_npm_package_author(author_str): - """ - Parse npm package author string. - - It works with a flexible range of formats, as detailed below:: - - name - name <email> (url) - name <email>(url) - name<email> (url) - name<email>(url) - name (url) <email> - name (url)<email> - name(url) <email> - name(url)<email> - name (url) - name(url) - name <email> - name<email> - <email> (url) - <email>(url) - (url) <email> - (url)<email> - <email> - (url) - - Args: - author_str (str): input author string - - Returns: - dict: A dict that may contain the following keys: - * name - * email - * url - - """ - author = {} - matches = re.findall(_author_regexp, - author_str.replace('<>', '').replace('()', ''), - re.M) - for match in matches: - if match[0].strip(): - author['name'] = match[0].strip() - if match[1].strip(): - author['email'] = match[1].strip() - if match[2].strip(): - author['url'] = match[2].strip() - return author - - def extract_npm_package_author(package_json): """ Extract package author from a ``package.json`` file content and @@ -246,31 +191,9 @@ for author_key in ('author', 'authors'): if author_key in package_json: author_str = _author_str(package_json[author_key]) - author_data = parse_npm_package_author(author_str) - - name = author_data.get('name') - email = author_data.get('email') - - fullname = None - - if name and email: - fullname = '%s <%s>' % (name, email) - elif name: - fullname = name - - if not fullname: - return _EMPTY_AUTHOR - - if fullname: - fullname = fullname.encode('utf-8') - - if name: - name = name.encode('utf-8') - - if email: - email = email.encode('utf-8') + author_data = parse_author(author_str) - return {'fullname': fullname, 'name': name, 'email': email} + return swh_author(author_data) def _lstrip_bom(s, bom=BOM_UTF8): diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -10,7 +10,7 @@ from swh.model.hashutil import hash_to_bytes from swh.loader.package.npm.loader 
import ( - NpmLoader, parse_npm_package_author, extract_npm_package_author, + NpmLoader, extract_npm_package_author, artifact_to_revision_id ) from swh.loader.package.tests.common import ( @@ -18,148 +18,6 @@ ) -def _parse_author_string_test(author_str, expected_result): - assert parse_npm_package_author(author_str) == expected_result - assert parse_npm_package_author(' %s' % author_str) == expected_result - assert parse_npm_package_author('%s ' % author_str) == expected_result - - -def test_parse_npm_package_author(): - _parse_author_string_test( - 'John Doe', - { - 'name': 'John Doe' - } - ) - - _parse_author_string_test( - '<john.doe@foo.bar>', - { - 'email': 'john.doe@foo.bar' - } - ) - - _parse_author_string_test( - '(https://john.doe)', - { - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe <john.doe@foo.bar>', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar' - } - ) - - _parse_author_string_test( - 'John Doe<john.doe@foo.bar>', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe)', - { - 'name': 'John Doe', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe(https://john.doe)', - { - 'name': 'John Doe', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - '<john.doe@foo.bar> (https://john.doe)', - { - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - '(https://john.doe) <john.doe@foo.bar>', - { - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe <john.doe@foo.bar> (https://john.doe)', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe) <john.doe@foo.bar>', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe<john.doe@foo.bar> (https://john.doe)', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - 
_parse_author_string_test( - 'John Doe<john.doe@foo.bar>(https://john.doe)', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test('', {}) - _parse_author_string_test('<>', {}) - _parse_author_string_test(' <>', {}) - _parse_author_string_test('<>()', {}) - _parse_author_string_test('<> ()', {}) - _parse_author_string_test('()', {}) - _parse_author_string_test(' ()', {}) - - _parse_author_string_test( - 'John Doe <> ()', - { - 'name': 'John Doe' - } - ) - - _parse_author_string_test( - 'John Doe <>', - { - 'name': 'John Doe' - } - ) - - _parse_author_string_test( - 'John Doe ()', - { - 'name': 'John Doe' - } - ) - - def test_extract_npm_package_author(datadir): package_metadata_filepath = os.path.join( datadir, 'https_replicate.npmjs.com', 'org_visit1') diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -9,7 +9,9 @@ import swh.loader.package -from swh.loader.package.utils import download, api_info, release_name +from swh.loader.package.utils import ( + download, api_info, release_name, parse_author +) def test_version_generation(): @@ -155,3 +157,151 @@ ('0.0.1', None, 'releases/0.0.1'), ('0.0.2', 'something', 'releases/0.0.2/something')]: assert release_name(version, filename) == expected_release + + +def _parse_author_string_test(author_str, expected_result): + assert parse_author(author_str) == expected_result + assert parse_author(' %s' % author_str) == expected_result + assert parse_author('%s ' % author_str) == expected_result + + +def test_parse_author(): + _parse_author_string_test( + 'John Doe', + { + 'name': 'John Doe' + } + ) + + _parse_author_string_test( + '<john.doe@foo.bar>', + { + 'email': 'john.doe@foo.bar' + } + ) + + _parse_author_string_test( + '(https://john.doe)', + { + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe <john.doe@foo.bar>', + { + 'name': 'John Doe', + 
'email': 'john.doe@foo.bar' + } + ) + + _parse_author_string_test( + 'John Doe<john.doe@foo.bar>', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar' + } + ) + + _parse_author_string_test( + 'John Doe (https://john.doe)', + { + 'name': 'John Doe', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe(https://john.doe)', + { + 'name': 'John Doe', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + '<john.doe@foo.bar> (https://john.doe)', + { + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + '(https://john.doe) <john.doe@foo.bar>', + { + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe <john.doe@foo.bar> (https://john.doe)', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe (https://john.doe) <john.doe@foo.bar>', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe<john.doe@foo.bar> (https://john.doe)', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe<john.doe@foo.bar>(https://john.doe)', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test('', {}) + _parse_author_string_test('<>', {}) + _parse_author_string_test(' <>', {}) + _parse_author_string_test('<>()', {}) + _parse_author_string_test('<> ()', {}) + _parse_author_string_test('()', {}) + _parse_author_string_test(' ()', {}) + + _parse_author_string_test( + 'John Doe <> ()', + { + 'name': 'John Doe' + } + ) + + _parse_author_string_test( + 'John Doe <>', + { + 'name': 'John Doe' + } + ) + + _parse_author_string_test( + 'John Doe ()', + { + 'name': 'John Doe' + } + ) + + +# def test_swh_author(): +# for author, expected_author in [ +# ({}, ) +# ]: diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py --- a/swh/loader/package/utils.py +++ 
b/swh/loader/package/utils.py @@ -7,6 +7,7 @@ import logging import os import requests +import re from typing import Dict, Optional, Tuple @@ -20,6 +21,13 @@ DOWNLOAD_HASHES = set(['sha1', 'sha256', 'length']) +# https://github.com/jonschlinkert/author-regex +_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)' + + +_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None} + + def api_info(url: str) -> Dict: """Basic api client to retrieve information on project. This deals with fetching json metadata about pypi projects. @@ -111,3 +119,79 @@ if filename: return 'releases/%s/%s' % (version, filename) return 'releases/%s' % version + + +def parse_author(author_str: str) -> Dict[str, str]: + """ + Parse a package author string (npm-style ``name <email> (url)``). + + It works with a flexible range of formats, as detailed below:: + + name + name <email> (url) + name <email>(url) + name<email> (url) + name<email>(url) + name (url) <email> + name (url)<email> + name(url) <email> + name(url)<email> + name (url) + name(url) + name <email> + name<email> + <email> (url) + <email>(url) + (url) <email> + (url)<email> + <email> + (url) + + Args: + author_str (str): input author string + + Returns: + dict: A dict that may contain the following keys: + * name + * email + * url + + """ + author = {} + matches = re.findall(_author_regexp, + author_str.replace('<>', '').replace('()', ''), + re.M) + for match in matches: + if match[0].strip(): + author['name'] = match[0].strip() + if match[1].strip(): + author['email'] = match[1].strip() + if match[2].strip(): + author['url'] = match[2].strip() + return author + + +def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]: + """Transform an author like dict to an expected swh like dict (values are + bytes) + + """ + name = author.get('name') + email = author.get('email') + + fullname = None + + if name and email: + fullname = '%s <%s>' % (name, email) + elif name: + fullname = name + + if not fullname: + r = dict(_EMPTY_AUTHOR) + else: + r = { + 'fullname': fullname.encode('utf-8') if fullname else None, + 'name': name.encode('utf-8') if name 
else None, + 'email': email.encode('utf-8') if email else None + } + return r diff --git a/swh/loader/tests/test_cli.py b/swh/loader/tests/test_cli.py --- a/swh/loader/tests/test_cli.py +++ b/swh/loader/tests/test_cli.py @@ -60,7 +60,7 @@ result = runner.invoke(run, ['-h']) assert result.exit_code == 0 - expected_help_msg = """Usage: run [OPTIONS] [archive|debian|deposit|npm|pypi] URL [OPTIONS]... + expected_help_msg = """Usage: run [OPTIONS] [archive|cran|debian|deposit|npm|pypi] URL [OPTIONS]... Ingest with loader the origin located at @@ -89,7 +89,7 @@ runner = CliRunner() result = runner.invoke(list, ['--help']) assert result.exit_code == 0 - expected_help_msg = """Usage: list [OPTIONS] [[all|archive|debian|deposit|npm|pypi]] + expected_help_msg = """Usage: list [OPTIONS] [[all|archive|cran|debian|deposit|npm|pypi]] List supported loaders and optionally their arguments