D2463.id8866.diff

diff --git a/conftest.py b/conftest.py
--- a/conftest.py
+++ b/conftest.py
@@ -59,6 +59,7 @@
def celery_includes():
return [
'swh.loader.package.archive.tasks',
+ 'swh.loader.package.cran.tasks',
'swh.loader.package.debian.tasks',
'swh.loader.package.deposit.tasks',
'swh.loader.package.npm.tasks',
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@
iso8601
pkginfo
python-debian
+python-dateutil
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@
entry_points='''
[swh.workers]
loader.archive=swh.loader.package.archive:register
+ loader.cran=swh.loader.package.cran:register
loader.debian=swh.loader.package.debian:register
loader.deposit=swh.loader.package.deposit:register
loader.npm=swh.loader.package.npm:register
diff --git a/swh/loader/package/cran/__init__.py b/swh/loader/package/cran/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from typing import Any, Mapping
+
+
+def register() -> Mapping[str, Any]:
+ """Register the current worker module's definition"""
+ from .loader import CRANLoader
+ return {
+ 'task_modules': [f'{__name__}.tasks'],
+ 'loader': CRANLoader,
+ }
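
A minimal sketch of how this registration hook is consumed (the url and version values below are made-up placeholders, not part of the diff):

    from swh.loader.package.cran import register

    registry = register()
    CRANLoader = registry['loader']          # class returned by register()
    loader = CRANLoader(
        'https://cran.example.org/some-package-1.0.0.tar.gz',  # placeholder url
        version='1.0.0')
    status = loader.load()  # e.g. {'status': 'eventful', 'snapshot_id': ...}
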
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/loader.py
@@ -0,0 +1,160 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import dateutil.parser
+import datetime
+import os
+import logging
+import re
+
+from datetime import timezone
+from os import path
+from typing import Any, Generator, Dict, List, Mapping, Optional, Tuple
+
+from debian.deb822 import Deb822
+
+from swh.loader.package.loader import PackageLoader
+from swh.loader.package.utils import release_name, parse_author, swh_author
+from swh.model.identifiers import normalize_timestamp
+
+
+logger = logging.getLogger(__name__)
+
+
+DATE_PATTERN = re.compile(r'^(?P<year>\d{4})-(?P<month>\d{2})$')
+
+
+class CRANLoader(PackageLoader):
+ visit_type = 'cran'
+
+ def __init__(self, url: str, version: str):
+ """Loader constructor.
+
+ Args:
+ url: Origin url to retrieve the CRAN artifact from
+ version: Version of the CRAN artifact
+
+ """
+ super().__init__(url=url)
+ self.version = version
+ self.provider_url = url
+
+ def get_versions(self) -> List[str]:
+ # only 1 artifact
+ return [self.version]
+
+ def get_default_version(self) -> str:
+ return self.version
+
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Dict[str, Any]], None, None]:
+ p_info = {
+ 'url': self.url,
+ 'filename': path.split(self.url)[-1],
+ 'raw': {}
+ }
+ yield release_name(version), p_info
+
+ def build_revision(
+ self, a_metadata: Mapping[str, Any],
+ uncompressed_path: str) -> Dict[str, Any]:
+ # a_metadata is empty
+ metadata = extract_intrinsic_metadata(uncompressed_path)
+ normalized_date = normalize_timestamp(parse_date(metadata.get('Date')))
+ author = swh_author(parse_author(metadata.get('Maintainer', '')))
+ version = metadata.get('Version', self.version)
+ return {
+ 'message': version.encode('utf-8'),
+ 'type': 'tar',
+ 'date': normalized_date,
+ 'author': author,
+ 'committer': author,
+ 'committer_date': normalized_date,
+ 'parents': [],
+ 'metadata': {
+ 'intrinsic': {
+ 'tool': 'DESCRIPTION',
+ 'raw': metadata,
+ },
+ 'extrinsic': {
+ 'provider': self.provider_url,
+ 'when': self.visit_date.isoformat(),
+ 'raw': a_metadata,
+ },
+ },
+ }
+
+
+def parse_debian_control(filepath: str) -> Dict[str, Any]:
+ """Parse debian control at filepath"""
+ metadata: Dict = {}
+ logger.debug('Debian control file %s', filepath)
+ for paragraph in Deb822.iter_paragraphs(open(filepath)):
+ logger.debug('paragraph: %s', paragraph)
+ metadata.update(**paragraph)
+
+ logger.debug('metadata parsed: %s', metadata)
+ return metadata
+
+
+def extract_intrinsic_metadata(dir_path: str) -> Dict[str, Any]:
+ """Given an uncompressed path holding the DESCRIPTION file, returns a
+ DESCRIPTION parsed structure as a dict.
+
+ CRAN origins describe their intrinsic metadata within a DESCRIPTION file
+ at the root of the tarball tree. This DESCRIPTION file uses a simple format
+ called DCF, the Debian control format.
+
+ The release artifact contains a single folder at its root. For example:
+ $ tar tvf zprint-0.0.6.tar.gz
+ drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
+ ...
+
+ Args:
+ dir_path (str): Path to the uncompressed directory
+ representing a release artifact from CRAN.
+
+ Returns:
+ the DESCRIPTION parsed structure as a dict (or empty dict if missing)
+
+ """
+ # Retrieve the root folder of the archive
+ if not os.path.exists(dir_path):
+ return {}
+ lst = os.listdir(dir_path)
+ if len(lst) != 1:
+ return {}
+ project_dirname = lst[0]
+ description_path = os.path.join(dir_path, project_dirname, 'DESCRIPTION')
+ if not os.path.exists(description_path):
+ return {}
+ return parse_debian_control(description_path)
+
+
+def parse_date(date: Optional[str]) -> Optional[datetime.datetime]:
+ """Parse a date into a datetime
+
+ """
+ assert not date or isinstance(date, str)
+ dt: Optional[datetime.datetime] = None
+ if not date:
+ return dt
+ try:
+ specific_date = DATE_PATTERN.match(date)
+ if specific_date:
+ year = int(specific_date.group('year'))
+ month = int(specific_date.group('month'))
+ dt = datetime.datetime(year, month, 1)
+ else:
+ dt = dateutil.parser.parse(date)
+
+ if not dt.tzinfo:
+ # (up for discussion) the timezone needs to be set, otherwise
+ # normalize_timestamp raises: ValueError: normalize_timestamp
+ # received datetime without timezone: 2001-06-08 00:00:00
+ dt = dt.replace(tzinfo=timezone.utc)
+ except Exception as e:
+ logger.warning('Failed to parse date %s. Reason: %s', date, e)
+ return dt
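
To illustrate the extraction helpers above, a small usage sketch of parse_debian_control and parse_date on a hand-written DESCRIPTION snippet (the file content is a made-up example; the expected results match the test data further down):

    import tempfile
    from swh.loader.package.cran.loader import parse_date, parse_debian_control

    # Minimal DCF (Debian control format) content, as found in a DESCRIPTION file
    description = (
        b'Package: example\n'
        b'Version: 1.0.0\n'
        b'Date: 2001-June-08\n'
        b'Maintainer: Jane Doe <jane@example.org>\n'
    )

    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(description)

    metadata = parse_debian_control(f.name)
    # {'Package': 'example', 'Version': '1.0.0', 'Date': '2001-June-08', ...}

    parse_date(metadata['Date'])   # datetime(2001, 6, 8, tzinfo=timezone.utc)
    parse_date('$Date$')           # None (unparsable, a warning is logged)
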
diff --git a/swh/loader/package/cran/tasks.py b/swh/loader/package/cran/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/tasks.py
@@ -0,0 +1,14 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.loader.package.cran.loader import CRANLoader
+
+
+@shared_task(name=__name__ + '.LoadCran')
+def load_cran(url=None, version=None):
+ """Load CRAN package artifact into the archive"""
+ return CRANLoader(url, version).load()
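
For reference, the task can be enqueued by name, much like the task test below does; a hedged sketch (the Celery app configuration and the url are assumptions, not part of this diff):

    from celery import current_app  # assumes a configured Celery application

    result = current_app.send_task(
        'swh.loader.package.cran.tasks.LoadCran',
        kwargs={
            'url': 'https://cran.example.org/some-package-1.0.0.tar.gz',  # placeholder
            'version': '1.0.0',
        })
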
diff --git a/swh/loader/package/cran/tests/__init__.py b/swh/loader/package/cran/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz b/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/tests/test_cran.py
@@ -0,0 +1,198 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import pytest
+
+from datetime import datetime, timezone
+from dateutil.tz import tzlocal
+
+from os import path
+
+from swh.loader.package.cran.loader import (
+ extract_intrinsic_metadata, CRANLoader, parse_date
+)
+from swh.core.tarball import uncompress
+
+from swh.loader.package.tests.common import (
+ check_snapshot, get_stats
+)
+
+
+def test_cran_parse_date():
+ data = [
+ # parsable, some have debatable results though
+ ('2001-June-08',
+ datetime(2001, 6, 8, 0, 0, tzinfo=timezone.utc)),
+ ('Tue Dec 27 15:06:08 PST 2011',
+ datetime(2011, 12, 27, 15, 6, 8, tzinfo=timezone.utc)),
+ ('8-14-2013',
+ datetime(2013, 8, 14, 0, 0, tzinfo=timezone.utc)),
+ ('2011-01',
+ datetime(2011, 1, 1, 0, 0, tzinfo=timezone.utc)),
+ ('201109',
+ datetime(2009, 11, 20, 0, 0, tzinfo=timezone.utc)),
+ ('04-12-2014',
+ datetime(2014, 4, 12, 0, 0, tzinfo=timezone.utc)),
+ ('2018-08-24, 10:40:10',
+ datetime(2018, 8, 24, 10, 40, 10, tzinfo=timezone.utc)),
+ ('2013-October-16',
+ datetime(2013, 10, 16, 0, 0, tzinfo=timezone.utc)),
+ ('Aug 23, 2013',
+ datetime(2013, 8, 23, 0, 0, tzinfo=timezone.utc)),
+ ('27-11-2014',
+ datetime(2014, 11, 27, 0, 0, tzinfo=timezone.utc)),
+ ('2019-09-26,',
+ datetime(2019, 9, 26, 0, 0, tzinfo=timezone.utc)),
+ ('9/25/2014',
+ datetime(2014, 9, 25, 0, 0, tzinfo=timezone.utc)),
+ ('Fri Jun 27 17:23:53 2014',
+ datetime(2014, 6, 27, 17, 23, 53, tzinfo=timezone.utc)),
+ ('28-04-2014',
+ datetime(2014, 4, 28, 0, 0, tzinfo=timezone.utc)),
+ ('04-14-2014',
+ datetime(2014, 4, 14, 0, 0, tzinfo=timezone.utc)),
+ ('2019-05-08 14:17:31 UTC',
+ datetime(2019, 5, 8, 14, 17, 31, tzinfo=timezone.utc)),
+ ('Wed May 21 13:50:39 CEST 2014',
+ datetime(2014, 5, 21, 13, 50, 39, tzinfo=tzlocal())),
+ ('2018-04-10 00:01:04 KST',
+ datetime(2018, 4, 10, 0, 1, 4, tzinfo=timezone.utc)),
+ ('2019-08-25 10:45',
+ datetime(2019, 8, 25, 10, 45, tzinfo=timezone.utc)),
+ ('March 9, 2015',
+ datetime(2015, 3, 9, 0, 0, tzinfo=timezone.utc)),
+ ('Aug. 18, 2012',
+ datetime(2012, 8, 18, 0, 0, tzinfo=timezone.utc)),
+ ('2014-Dec-17',
+ datetime(2014, 12, 17, 0, 0, tzinfo=timezone.utc)),
+ ('March 01, 2013',
+ datetime(2013, 3, 1, 0, 0, tzinfo=timezone.utc)),
+ ('2017-04-08.',
+ datetime(2017, 4, 8, 0, 0, tzinfo=timezone.utc)),
+ ('2014-Apr-22',
+ datetime(2014, 4, 22, 0, 0, tzinfo=timezone.utc)),
+ ('Mon Jan 12 19:54:04 2015',
+ datetime(2015, 1, 12, 19, 54, 4, tzinfo=timezone.utc)),
+ ('May 22, 2014',
+ datetime(2014, 5, 22, 0, 0, tzinfo=timezone.utc)),
+ ('2014-08-12 09:55:10 EDT',
+ datetime(2014, 8, 12, 9, 55, 10, tzinfo=timezone.utc)),
+ # unparsable
+ ('Fabruary 21, 2012', None),
+ ('2019-05-28"', None),
+ ('2017-03-01 today', None),
+ ('2016-11-0110.1093/icesjms/fsw182', None),
+ ('2019-07-010', None),
+ ('2015-02.23', None),
+ ('20013-12-30', None),
+ ('2016-08-017', None),
+ ('2019-02-07l', None),
+ ('2018-05-010', None),
+ ('2019-09-27 KST', None),
+ ('$Date$', None),
+ ('2019-09-27 KST', None),
+ ('2019-06-22 $Date$', None),
+ ('$Date: 2013-01-18 12:49:03 -0600 (Fri, 18 Jan 2013) $', None),
+ ('2015-7-013', None),
+ ('2018-05-023', None),
+ ("Check NEWS file for changes: news(package='simSummary')", None)
+ ]
+ for date, expected_date in data:
+ actual_date = parse_date(date)
+ assert actual_date == expected_date, f'input date to parse {date}'
+
+
+@pytest.mark.fs
+def test_extract_intrinsic_metadata(tmp_path, datadir):
+ """Parsing an existing archive's DESCRIPTION file should yield results"""
+ uncompressed_archive_path = str(tmp_path)
+ # sample url
+ # https://cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz # noqa
+ archive_path = path.join(
+ datadir, 'https_cran.r-project.org',
+ 'src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz')
+ uncompress(archive_path, dest=uncompressed_archive_path)
+
+ actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path)
+
+ expected_metadata = {
+ 'Package': 'KernSmooth',
+ 'Priority': 'recommended',
+ 'Version': '2.22-6',
+ 'Date': '2001-June-08',
+ 'Title': 'Functions for kernel smoothing for Wand & Jones (1995)',
+ 'Author': 'S original by Matt Wand.\n\tR port by Brian Ripley <ripley@stats.ox.ac.uk>.', # noqa
+ 'Maintainer': 'Brian Ripley <ripley@stats.ox.ac.uk>',
+ 'Description': 'functions for kernel smoothing (and density estimation)\n corresponding to the book: \n Wand, M.P. and Jones, M.C. (1995) "Kernel Smoothing".', # noqa
+ 'License': 'Unlimited use and distribution (see LICENCE).',
+ 'URL': 'http://www.biostat.harvard.edu/~mwand'
+ }
+
+ assert actual_metadata == expected_metadata
+
+
+@pytest.mark.fs
+def test_extract_intrinsic_metadata_failures(tmp_path):
+ """Parsing a nonexistent path/archive/DESCRIPTION yields an empty dict"""
+ # nonexistent first level path
+ assert extract_intrinsic_metadata('/something-inexistent') == {}
+ # nonexistent second level path (CRAN archives have a single top-level folder)
+ assert extract_intrinsic_metadata(tmp_path) == {}
+ # missing DESCRIPTION within the second level path
+ existing_path_no_description = str(tmp_path / 'something')
+ os.mkdir(existing_path_no_description)
+ assert extract_intrinsic_metadata(tmp_path) == {}
+
+
+def test_cran_one_visit(swh_config, requests_mock_datadir):
+ version = '2.22-6'
+ base_url = 'https://cran.r-project.org'
+ url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz' # noqa
+ loader = CRANLoader(url, version=version)
+
+ actual_load_status = loader.load()
+
+ expected_snapshot_id = '920adcccc78aaeedd3cfa4459dd900d8c3431a21'
+ assert actual_load_status == {
+ 'status': 'eventful',
+ 'snapshot_id': expected_snapshot_id
+ }
+
+ expected_snapshot = {
+ 'id': expected_snapshot_id,
+ 'branches': {
+ 'HEAD': {'target': f'releases/{version}', 'target_type': 'alias'},
+ f'releases/{version}': {
+ 'target': '42bdb16facd5140424359c8ce89a28ecfa1ce603',
+ 'target_type': 'revision'
+ }
+ }
+ }
+ check_snapshot(expected_snapshot, loader.storage)
+
+ origin_visit = next(loader.storage.origin_visit_get(url))
+ assert origin_visit['status'] == 'full'
+ assert origin_visit['type'] == 'cran'
+
+ visit_stats = get_stats(loader.storage)
+ assert {
+ 'content': 33,
+ 'directory': 7,
+ 'origin': 1,
+ 'origin_visit': 1,
+ 'person': 1,
+ 'release': 0,
+ 'revision': 1,
+ 'skipped_content': 0,
+ 'snapshot': 1
+ } == visit_stats
+
+ urls = [
+ m.url for m in requests_mock_datadir.request_history
+ if m.url.startswith(base_url)
+ ]
+ # the artifact url was requested only once
+ assert len(urls) == 1
diff --git a/swh/loader/package/cran/tests/test_tasks.py b/swh/loader/package/cran/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/tests/test_tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def test_cran_loader(mocker, swh_app, celery_session_worker, swh_config):
+ mock_loader = mocker.patch(
+ 'swh.loader.package.cran.loader.CRANLoader.load')
+ mock_loader.return_value = {'status': 'eventful'}
+
+ res = swh_app.send_task(
+ 'swh.loader.package.cran.tasks.LoadCran',
+ (), dict(url='some-url', version='1.2.3'))
+ assert res
+ res.wait()
+ assert res.successful()
+
+ assert res.result == {'status': 'eventful'}
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -6,7 +6,6 @@
import json
import logging
import os
-import re
from codecs import BOM_UTF8
from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional
@@ -17,18 +16,14 @@
from urllib.parse import quote
from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader
-from swh.loader.package.utils import api_info, release_name
+from swh.loader.package.utils import (
+ api_info, release_name, parse_author, swh_author
+)
logger = logging.getLogger(__name__)
-_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
-
-# https://github.com/jonschlinkert/author-regex
-_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
-
-
class NpmLoader(PackageLoader):
"""Load npm origin's artifact releases into swh archive.
@@ -162,56 +157,6 @@
return None
-def parse_npm_package_author(author_str):
- """
- Parse npm package author string.
-
- It works with a flexible range of formats, as detailed below::
-
- name
- name <email> (url)
- name <email>(url)
- name<email> (url)
- name<email>(url)
- name (url) <email>
- name (url)<email>
- name(url) <email>
- name(url)<email>
- name (url)
- name(url)
- name <email>
- name<email>
- <email> (url)
- <email>(url)
- (url) <email>
- (url)<email>
- <email>
- (url)
-
- Args:
- author_str (str): input author string
-
- Returns:
- dict: A dict that may contain the following keys:
- * name
- * email
- * url
-
- """
- author = {}
- matches = re.findall(_author_regexp,
- author_str.replace('<>', '').replace('()', ''),
- re.M)
- for match in matches:
- if match[0].strip():
- author['name'] = match[0].strip()
- if match[1].strip():
- author['email'] = match[1].strip()
- if match[2].strip():
- author['url'] = match[2].strip()
- return author
-
-
def extract_npm_package_author(package_json):
"""
Extract package author from a ``package.json`` file content and
@@ -246,31 +191,9 @@
for author_key in ('author', 'authors'):
if author_key in package_json:
author_str = _author_str(package_json[author_key])
- author_data = parse_npm_package_author(author_str)
-
- name = author_data.get('name')
- email = author_data.get('email')
-
- fullname = None
-
- if name and email:
- fullname = '%s <%s>' % (name, email)
- elif name:
- fullname = name
-
- if not fullname:
- return _EMPTY_AUTHOR
-
- if fullname:
- fullname = fullname.encode('utf-8')
-
- if name:
- name = name.encode('utf-8')
-
- if email:
- email = email.encode('utf-8')
+ author_data = parse_author(author_str)
- return {'fullname': fullname, 'name': name, 'email': email}
+ return swh_author(author_data)
def _lstrip_bom(s, bom=BOM_UTF8):
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -10,7 +10,7 @@
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.npm.loader import (
- NpmLoader, parse_npm_package_author, extract_npm_package_author,
+ NpmLoader, extract_npm_package_author,
artifact_to_revision_id
)
from swh.loader.package.tests.common import (
@@ -18,148 +18,6 @@
)
-def _parse_author_string_test(author_str, expected_result):
- assert parse_npm_package_author(author_str) == expected_result
- assert parse_npm_package_author(' %s' % author_str) == expected_result
- assert parse_npm_package_author('%s ' % author_str) == expected_result
-
-
-def test_parse_npm_package_author():
- _parse_author_string_test(
- 'John Doe',
- {
- 'name': 'John Doe'
- }
- )
-
- _parse_author_string_test(
- '<john.doe@foo.bar>',
- {
- 'email': 'john.doe@foo.bar'
- }
- )
-
- _parse_author_string_test(
- '(https://john.doe)',
- {
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe <john.doe@foo.bar>',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar'
- }
- )
-
- _parse_author_string_test(
- 'John Doe<john.doe@foo.bar>',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar'
- }
- )
-
- _parse_author_string_test(
- 'John Doe (https://john.doe)',
- {
- 'name': 'John Doe',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe(https://john.doe)',
- {
- 'name': 'John Doe',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- '<john.doe@foo.bar> (https://john.doe)',
- {
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- '(https://john.doe) <john.doe@foo.bar>',
- {
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe <john.doe@foo.bar> (https://john.doe)',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe (https://john.doe) <john.doe@foo.bar>',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe<john.doe@foo.bar> (https://john.doe)',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe<john.doe@foo.bar>(https://john.doe)',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test('', {})
- _parse_author_string_test('<>', {})
- _parse_author_string_test(' <>', {})
- _parse_author_string_test('<>()', {})
- _parse_author_string_test('<> ()', {})
- _parse_author_string_test('()', {})
- _parse_author_string_test(' ()', {})
-
- _parse_author_string_test(
- 'John Doe <> ()',
- {
- 'name': 'John Doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe <>',
- {
- 'name': 'John Doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe ()',
- {
- 'name': 'John Doe'
- }
- )
-
-
def test_extract_npm_package_author(datadir):
package_metadata_filepath = os.path.join(
datadir, 'https_replicate.npmjs.com', 'org_visit1')
diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
--- a/swh/loader/package/tests/test_utils.py
+++ b/swh/loader/package/tests/test_utils.py
@@ -9,7 +9,9 @@
import swh.loader.package
-from swh.loader.package.utils import download, api_info, release_name
+from swh.loader.package.utils import (
+ download, api_info, release_name, parse_author
+)
def test_version_generation():
@@ -155,3 +157,151 @@
('0.0.1', None, 'releases/0.0.1'),
('0.0.2', 'something', 'releases/0.0.2/something')]:
assert release_name(version, filename) == expected_release
+
+
+def _parse_author_string_test(author_str, expected_result):
+ assert parse_author(author_str) == expected_result
+ assert parse_author(' %s' % author_str) == expected_result
+ assert parse_author('%s ' % author_str) == expected_result
+
+
+def test_parse_author():
+ _parse_author_string_test(
+ 'John Doe',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+ _parse_author_string_test(
+ '<john.doe@foo.bar>',
+ {
+ 'email': 'john.doe@foo.bar'
+ }
+ )
+
+ _parse_author_string_test(
+ '(https://john.doe)',
+ {
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe <john.doe@foo.bar>',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe<john.doe@foo.bar>',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe (https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe(https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ '<john.doe@foo.bar> (https://john.doe)',
+ {
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ '(https://john.doe) <john.doe@foo.bar>',
+ {
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe <john.doe@foo.bar> (https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe (https://john.doe) <john.doe@foo.bar>',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe<john.doe@foo.bar> (https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe<john.doe@foo.bar>(https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test('', {})
+ _parse_author_string_test('<>', {})
+ _parse_author_string_test(' <>', {})
+ _parse_author_string_test('<>()', {})
+ _parse_author_string_test('<> ()', {})
+ _parse_author_string_test('()', {})
+ _parse_author_string_test(' ()', {})
+
+ _parse_author_string_test(
+ 'John Doe <> ()',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe <>',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe ()',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+
+# def test_swh_author():
+# for author, expected_author in [
+# ({}, )
+# ]:
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -7,6 +7,7 @@
import logging
import os
import requests
+import re
from typing import Dict, Optional, Tuple
@@ -20,6 +21,13 @@
DOWNLOAD_HASHES = set(['sha1', 'sha256', 'length'])
+# https://github.com/jonschlinkert/author-regex
+_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
+
+
+_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
+
+
def api_info(url: str) -> Dict:
"""Basic api client to retrieve information on project. This deals with
fetching json metadata about pypi projects.
@@ -111,3 +119,79 @@
if filename:
return 'releases/%s/%s' % (version, filename)
return 'releases/%s' % version
+
+
+def parse_author(author_str: str) -> Dict[str, str]:
+ """
+ Parse npm package author string.
+
+ It works with a flexible range of formats, as detailed below::
+
+ name
+ name <email> (url)
+ name <email>(url)
+ name<email> (url)
+ name<email>(url)
+ name (url) <email>
+ name (url)<email>
+ name(url) <email>
+ name(url)<email>
+ name (url)
+ name(url)
+ name <email>
+ name<email>
+ <email> (url)
+ <email>(url)
+ (url) <email>
+ (url)<email>
+ <email>
+ (url)
+
+ Args:
+ author_str (str): input author string
+
+ Returns:
+ dict: A dict that may contain the following keys:
+ * name
+ * email
+ * url
+
+ """
+ author = {}
+ matches = re.findall(_author_regexp,
+ author_str.replace('<>', '').replace('()', ''),
+ re.M)
+ for match in matches:
+ if match[0].strip():
+ author['name'] = match[0].strip()
+ if match[1].strip():
+ author['email'] = match[1].strip()
+ if match[2].strip():
+ author['url'] = match[2].strip()
+ return author
+
+
+def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]:
+ """Transform an author like dict to an expected swh like dict (values are
+ bytes)
+
+ """
+ name = author.get('name')
+ email = author.get('email')
+
+ fullname = None
+
+ if name and email:
+ fullname = '%s <%s>' % (name, email)
+ elif name:
+ fullname = name
+
+ if not fullname:
+ r = _EMPTY_AUTHOR
+ else:
+ r = {
+ 'fullname': fullname.encode('utf-8') if fullname else None,
+ 'name': name.encode('utf-8') if name else None,
+ 'email': email.encode('utf-8') if email else None
+ }
+ return r
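
A short usage sketch of the helpers now shared between the npm and CRAN loaders (the author string is an arbitrary example):

    from swh.loader.package.utils import parse_author, swh_author

    author = parse_author('John Doe <john.doe@foo.bar>')
    # {'name': 'John Doe', 'email': 'john.doe@foo.bar'}

    swh_author(author)
    # {'fullname': b'John Doe <john.doe@foo.bar>',
    #  'name': b'John Doe',
    #  'email': b'john.doe@foo.bar'}

    swh_author({})  # no name: falls back to the empty author
    # {'fullname': b'', 'name': None, 'email': None}
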
diff --git a/swh/loader/tests/test_cli.py b/swh/loader/tests/test_cli.py
--- a/swh/loader/tests/test_cli.py
+++ b/swh/loader/tests/test_cli.py
@@ -60,7 +60,7 @@
result = runner.invoke(run, ['-h'])
assert result.exit_code == 0
- expected_help_msg = """Usage: run [OPTIONS] [archive|debian|deposit|npm|pypi] URL [OPTIONS]...
+ expected_help_msg = """Usage: run [OPTIONS] [archive|cran|debian|deposit|npm|pypi] URL [OPTIONS]...
Ingest with loader <type> the origin located at <url>
@@ -89,7 +89,7 @@
runner = CliRunner()
result = runner.invoke(list, ['--help'])
assert result.exit_code == 0
- expected_help_msg = """Usage: list [OPTIONS] [[all|archive|debian|deposit|npm|pypi]]
+ expected_help_msg = """Usage: list [OPTIONS] [[all|archive|cran|debian|deposit|npm|pypi]]
List supported loaders and optionally their arguments
