diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ swh.core >= 0.0.75 -swh.model >= 0.0.57 +swh.model >= 0.0.60 swh.scheduler swh.storage >= 0.0.163 diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py --- a/swh/loader/package/cran/loader.py +++ b/swh/loader/package/cran/loader.py @@ -17,10 +17,10 @@ from swh.loader.package.loader import PackageLoader from swh.loader.package.utils import ( - release_name, parse_author, swh_author, artifact_identity + release_name, artifact_identity ) from swh.model.model import ( - TimestampWithTimezone, Sha1Git, Revision, RevisionType, + Person, TimestampWithTimezone, Sha1Git, Revision, RevisionType, ) @@ -92,7 +92,7 @@ # a_metadata is empty metadata = extract_intrinsic_metadata(uncompressed_path) date = parse_date(metadata.get('Date')) - author = swh_author(parse_author(metadata.get('Maintainer', {}))) + author = Person.from_address(metadata.get('Maintainer', b'')) version = metadata.get('Version', a_metadata['version']) return Revision( message=version.encode('utf-8'), diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -20,7 +20,7 @@ from swh.loader.package.loader import PackageLoader from swh.loader.package.utils import ( - api_info, release_name, parse_author, swh_author + api_info, release_name ) @@ -207,13 +207,12 @@ else: return author_data - author_data: Dict = {} for author_key in ('author', 'authors'): if author_key in package_json: author_str = _author_str(package_json[author_key]) - author_data = parse_author(author_str) + return Person.from_address(author_str) - return swh_author(author_data) + return Person(fullname=b'', name=None, email=None) def _lstrip_bom(s, bom=BOM_UTF8): diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -131,9 +131,9 @@ assert extract_npm_package_author(package_json) == \ Person( - fullname=b'fengmk2 ', + fullname=b'fengmk2 (https://fengmk2.com)', name=b'fengmk2', - email=b'fengmk2@gmail.com' + email=b'fengmk2@gmail.com', ) package_json = json.loads(''' diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -10,7 +10,7 @@ import swh.loader.package from swh.loader.package.utils import ( - download, api_info, release_name, parse_author, artifact_identity + download, api_info, release_name, artifact_identity ) @@ -159,148 +159,6 @@ assert release_name(version, filename) == expected_release -def _parse_author_string_test(author_str, expected_result): - assert parse_author(author_str) == expected_result - assert parse_author(' %s' % author_str) == expected_result - assert parse_author('%s ' % author_str) == expected_result - - -def test_parse_author(): - _parse_author_string_test( - 'John Doe', - { - 'name': 'John Doe' - } - ) - - _parse_author_string_test( - '', - { - 'email': 'john.doe@foo.bar' - } - ) - - _parse_author_string_test( - '(https://john.doe)', - { - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe ', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar' - } - ) - - _parse_author_string_test( - 'John Doe', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe)', - { - 'name': 'John Doe', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe(https://john.doe)', - { - 'name': 'John Doe', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - ' (https://john.doe)', - { - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - '(https://john.doe) ', - { - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe)', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe) ', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe (https://john.doe)', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test( - 'John Doe(https://john.doe)', - { - 'name': 'John Doe', - 'email': 'john.doe@foo.bar', - 'url': 'https://john.doe' - } - ) - - _parse_author_string_test('', {}) - _parse_author_string_test('<>', {}) - _parse_author_string_test(' <>', {}) - _parse_author_string_test('<>()', {}) - _parse_author_string_test('<> ()', {}) - _parse_author_string_test('()', {}) - _parse_author_string_test(' ()', {}) - - _parse_author_string_test( - 'John Doe <> ()', - { - 'name': 'John Doe' - } - ) - - _parse_author_string_test( - 'John Doe <>', - { - 'name': 'John Doe' - } - ) - - _parse_author_string_test( - 'John Doe ()', - { - 'name': 'John Doe' - } - ) - - def test_artifact_identity(): """Compute primary key should return the right identity diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -7,7 +7,6 @@ import logging import os import requests -import re from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple @@ -23,10 +22,6 @@ DOWNLOAD_HASHES = set(['sha1', 'sha256', 'length']) -# https://github.com/jonschlinkert/author-regex -_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)' - - EMPTY_AUTHOR = Person( fullname=b'', name=None, @@ -127,82 +122,6 @@ return 'releases/%s' % version -def parse_author(author_str: str) -> Dict[str, str]: - """ - Parse npm package author string. - - It works with a flexible range of formats, as detailed below:: - - name - name (url) - name (url) - name (url) - name(url) - name (url) - name (url) - name(url) - name(url) - name (url) - name(url) - name - name - (url) - (url) - (url) - (url) - - (url) - - Args: - author_str (str): input author string - - Returns: - dict: A dict that may contain the following keys: - * name - * email - * url - - """ - author = {} - matches = re.findall(_author_regexp, - author_str.replace('<>', '').replace('()', ''), - re.M) - for match in matches: - if match[0].strip(): - author['name'] = match[0].strip() - if match[1].strip(): - author['email'] = match[1].strip() - if match[2].strip(): - author['url'] = match[2].strip() - return author - - -def swh_author(author: Dict[str, str]) -> Person: - """Transform an author like dict to an expected swh like dict (values are - bytes) - - """ - name = author.get('name') - email = author.get('email') - - fullname = None - - if name and email: - fullname = '%s <%s>' % (name, email) - elif name: - fullname = name - - if not fullname: - r = EMPTY_AUTHOR - else: - r = Person( - fullname=fullname.encode('utf-8') if fullname else b'', - name=name.encode('utf-8') if name else None, - email=email.encode('utf-8') if email else None - ) - return r - - def artifact_identity(d: Mapping[str, Any], id_keys: Sequence[str]) -> List[Any]: """Compute the primary key for a dict using the id_keys as primary key