Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066394
D2463.id8866.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
31 KB
Subscribers
None
D2463.id8866.diff
View Options
diff --git a/conftest.py b/conftest.py
--- a/conftest.py
+++ b/conftest.py
@@ -59,6 +59,7 @@
def celery_includes():
return [
'swh.loader.package.archive.tasks',
+ 'swh.loader.package.cran.tasks',
'swh.loader.package.debian.tasks',
'swh.loader.package.deposit.tasks',
'swh.loader.package.npm.tasks',
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@
iso8601
pkginfo
python-debian
+python-dateutil
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@
entry_points='''
[swh.workers]
loader.archive=swh.loader.package.archive:register
+ loader.cran=swh.loader.package.cran:register
loader.debian=swh.loader.package.debian:register
loader.deposit=swh.loader.package.deposit:register
loader.npm=swh.loader.package.npm:register
diff --git a/swh/loader/package/cran/__init__.py b/swh/loader/package/cran/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from typing import Any, Mapping
+
+
+def register() -> Mapping[str, Any]:
+ """Register the current worker module's definition"""
+ from .loader import CRANLoader
+ return {
+ 'task_modules': [f'{__name__}.tasks'],
+ 'loader': CRANLoader,
+ }
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/loader.py
@@ -0,0 +1,160 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import dateutil.parser
+import datetime
+import os
+import logging
+import re
+
+from datetime import timezone
+from os import path
+from typing import Any, Generator, Dict, List, Mapping, Optional, Tuple
+
+from debian.deb822 import Deb822
+
+from swh.loader.package.loader import PackageLoader
+from swh.loader.package.utils import release_name, parse_author, swh_author
+from swh.model.identifiers import normalize_timestamp
+
+
+logger = logging.getLogger(__name__)
+
+
+DATE_PATTERN = re.compile(r'^(?P<year>\d{4})-(?P<month>\d{2})$')
+
+
+class CRANLoader(PackageLoader):
+ visit_type = 'cran'
+
+ def __init__(self, url: str, version: str):
+ """Loader constructor.
+
+ Args:
+ url: Origin url to retrieve cran artifact from
+ version: version of the cran artifact
+
+ """
+ super().__init__(url=url)
+ self.version = version
+ self.provider_url = url
+
+ def get_versions(self) -> List[str]:
+ # only 1 artifact
+ return [self.version]
+
+ def get_default_version(self) -> str:
+ return self.version
+
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Dict[str, Any]], None, None]:
+ p_info = {
+ 'url': self.url,
+ 'filename': path.split(self.url)[-1],
+ 'raw': {}
+ }
+ yield release_name(version), p_info
+
+ def build_revision(
+ self, a_metadata: Mapping[str, Any],
+ uncompressed_path: str) -> Dict[str, Any]:
+ # a_metadata is empty
+ metadata = extract_intrinsic_metadata(uncompressed_path)
+ normalized_date = normalize_timestamp(parse_date(metadata.get('Date')))
+ author = swh_author(parse_author(metadata.get('Maintainer', {})))
+ version = metadata.get('Version', self.version)
+ return {
+ 'message': version.encode('utf-8'),
+ 'type': 'tar',
+ 'date': normalized_date,
+ 'author': author,
+ 'committer': author,
+ 'committer_date': normalized_date,
+ 'parents': [],
+ 'metadata': {
+ 'intrinsic': {
+ 'tool': 'DESCRIPTION',
+ 'raw': metadata,
+ },
+ 'extrinsic': {
+ 'provider': self.provider_url,
+ 'when': self.visit_date.isoformat(),
+ 'raw': a_metadata,
+ },
+ },
+ }
+
+
+def parse_debian_control(filepath: str) -> Dict[str, Any]:
+ """Parse debian control at filepath"""
+ metadata: Dict = {}
+ logger.debug('Debian control file %s', filepath)
+ for paragraph in Deb822.iter_paragraphs(open(filepath)):
+ logger.debug('paragraph: %s', paragraph)
+ metadata.update(**paragraph)
+
+ logger.debug('metadata parsed: %s', metadata)
+ return metadata
+
+
+def extract_intrinsic_metadata(dir_path: str) -> Dict[str, Any]:
+ """Given an uncompressed path holding the DESCRIPTION file, returns a
+ DESCRIPTION parsed structure as a dict.
+
+ CRAN origins describe their intrinsic metadata within a DESCRIPTION file
+ at the root tree of a tarball. This DESCRIPTION uses a simple file format
+ called DCF, the Debian control format.
+
+ Each release artifact contains a single folder at its root. For example:
+ $ tar tvf zprint-0.0.6.tar.gz
+ drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
+ ...
+
+ Args:
+ dir_path (str): Path to the uncompressed directory
+ representing a release artifact from CRAN.
+
+ Returns:
+ the DESCRIPTION parsed structure as a dict (or empty dict if missing)
+
+ """
+ # Retrieve the root folder of the archive
+ if not os.path.exists(dir_path):
+ return {}
+ lst = os.listdir(dir_path)
+ if len(lst) != 1:
+ return {}
+ project_dirname = lst[0]
+ description_path = os.path.join(dir_path, project_dirname, 'DESCRIPTION')
+ if not os.path.exists(description_path):
+ return {}
+ return parse_debian_control(description_path)
+
+
+def parse_date(date: Optional[str]) -> Optional[datetime.datetime]:
+ """Parse a date into a datetime
+
+ """
+ assert not date or isinstance(date, str)
+ dt: Optional[datetime.datetime] = None
+ if not date:
+ return dt
+ try:
+ specific_date = DATE_PATTERN.match(date)
+ if specific_date:
+ year = int(specific_date.group('year'))
+ month = int(specific_date.group('month'))
+ dt = datetime.datetime(year, month, 1)
+ else:
+ dt = dateutil.parser.parse(date)
+
+ if not dt.tzinfo:
+ # Up for discussion: a timezone must be set, otherwise
+ # normalize_timestamp raises: ValueError: normalize_timestamp
+ # received datetime without timezone: 2001-06-08 00:00:00
+ dt = dt.replace(tzinfo=timezone.utc)
+ except Exception as e:
+ logger.warning('Fail to parse date %s. Reason: %s', (date, e))
+ return dt
diff --git a/swh/loader/package/cran/tasks.py b/swh/loader/package/cran/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/tasks.py
@@ -0,0 +1,14 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.loader.package.cran.loader import CRANLoader
+
+
+@shared_task(name=__name__ + '.LoadCran')
+def load_cran(url=None, version=None):
+ """Load a CRAN package artifact"""
+ return CRANLoader(url, version).load()
diff --git a/swh/loader/package/cran/tests/__init__.py b/swh/loader/package/cran/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz b/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/tests/test_cran.py
@@ -0,0 +1,198 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import pytest
+
+from datetime import datetime, timezone
+from dateutil.tz import tzlocal
+
+from os import path
+
+from swh.loader.package.cran.loader import (
+ extract_intrinsic_metadata, CRANLoader, parse_date
+)
+from swh.core.tarball import uncompress
+
+from swh.loader.package.tests.common import (
+ check_snapshot, get_stats
+)
+
+
+def test_cran_parse_date():
+ data = [
+ # parsable, some have debatable results though
+ ('2001-June-08',
+ datetime(2001, 6, 8, 0, 0, tzinfo=timezone.utc)),
+ ('Tue Dec 27 15:06:08 PST 2011',
+ datetime(2011, 12, 27, 15, 6, 8, tzinfo=timezone.utc)),
+ ('8-14-2013',
+ datetime(2013, 8, 14, 0, 0, tzinfo=timezone.utc)),
+ ('2011-01',
+ datetime(2011, 1, 1, 0, 0, tzinfo=timezone.utc)),
+ ('201109',
+ datetime(2009, 11, 20, 0, 0, tzinfo=timezone.utc)),
+ ('04-12-2014',
+ datetime(2014, 4, 12, 0, 0, tzinfo=timezone.utc)),
+ ('2018-08-24, 10:40:10',
+ datetime(2018, 8, 24, 10, 40, 10, tzinfo=timezone.utc)),
+ ('2013-October-16',
+ datetime(2013, 10, 16, 0, 0, tzinfo=timezone.utc)),
+ ('Aug 23, 2013',
+ datetime(2013, 8, 23, 0, 0, tzinfo=timezone.utc)),
+ ('27-11-2014',
+ datetime(2014, 11, 27, 0, 0, tzinfo=timezone.utc)),
+ ('2019-09-26,',
+ datetime(2019, 9, 26, 0, 0, tzinfo=timezone.utc)),
+ ('9/25/2014',
+ datetime(2014, 9, 25, 0, 0, tzinfo=timezone.utc)),
+ ('Fri Jun 27 17:23:53 2014',
+ datetime(2014, 6, 27, 17, 23, 53, tzinfo=timezone.utc)),
+ ('28-04-2014',
+ datetime(2014, 4, 28, 0, 0, tzinfo=timezone.utc)),
+ ('04-14-2014',
+ datetime(2014, 4, 14, 0, 0, tzinfo=timezone.utc)),
+ ('2019-05-08 14:17:31 UTC',
+ datetime(2019, 5, 8, 14, 17, 31, tzinfo=timezone.utc)),
+ ('Wed May 21 13:50:39 CEST 2014',
+ datetime(2014, 5, 21, 13, 50, 39, tzinfo=tzlocal())),
+ ('2018-04-10 00:01:04 KST',
+ datetime(2018, 4, 10, 0, 1, 4, tzinfo=timezone.utc)),
+ ('2019-08-25 10:45',
+ datetime(2019, 8, 25, 10, 45, tzinfo=timezone.utc)),
+ ('March 9, 2015',
+ datetime(2015, 3, 9, 0, 0, tzinfo=timezone.utc)),
+ ('Aug. 18, 2012',
+ datetime(2012, 8, 18, 0, 0, tzinfo=timezone.utc)),
+ ('2014-Dec-17',
+ datetime(2014, 12, 17, 0, 0, tzinfo=timezone.utc)),
+ ('March 01, 2013',
+ datetime(2013, 3, 1, 0, 0, tzinfo=timezone.utc)),
+ ('2017-04-08.',
+ datetime(2017, 4, 8, 0, 0, tzinfo=timezone.utc)),
+ ('2014-Apr-22',
+ datetime(2014, 4, 22, 0, 0, tzinfo=timezone.utc)),
+ ('Mon Jan 12 19:54:04 2015',
+ datetime(2015, 1, 12, 19, 54, 4, tzinfo=timezone.utc)),
+ ('May 22, 2014',
+ datetime(2014, 5, 22, 0, 0, tzinfo=timezone.utc)),
+ ('2014-08-12 09:55:10 EDT',
+ datetime(2014, 8, 12, 9, 55, 10, tzinfo=timezone.utc)),
+ # unparsable
+ ('Fabruary 21, 2012', None),
+ ('2019-05-28"', None),
+ ('2017-03-01 today', None),
+ ('2016-11-0110.1093/icesjms/fsw182', None),
+ ('2019-07-010', None),
+ ('2015-02.23', None),
+ ('20013-12-30', None),
+ ('2016-08-017', None),
+ ('2019-02-07l', None),
+ ('2018-05-010', None),
+ ('2019-09-27 KST', None),
+ ('$Date$', None),
+ ('2019-09-27 KST', None),
+ ('2019-06-22 $Date$', None),
+ ('$Date: 2013-01-18 12:49:03 -0600 (Fri, 18 Jan 2013) $', None),
+ ('2015-7-013', None),
+ ('2018-05-023', None),
+ ("Check NEWS file for changes: news(package='simSummary')", None)
+ ]
+ for date, expected_date in data:
+ actual_date = parse_date(date)
+ assert actual_date == expected_date, f'input date to parse {date}'
+
+
+@pytest.mark.fs
+def test_extract_intrinsic_metadata(tmp_path, datadir):
+ """Parsing existing archive's DESCRIPTION should yield results"""
+ uncompressed_archive_path = str(tmp_path)
+ # sample url
+ # https://cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz # noqa
+ archive_path = path.join(
+ datadir, 'https_cran.r-project.org',
+ 'src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz')
+ uncompress(archive_path, dest=uncompressed_archive_path)
+
+ actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path)
+
+ expected_metadata = {
+ 'Package': 'KernSmooth',
+ 'Priority': 'recommended',
+ 'Version': '2.22-6',
+ 'Date': '2001-June-08',
+ 'Title': 'Functions for kernel smoothing for Wand & Jones (1995)',
+ 'Author': 'S original by Matt Wand.\n\tR port by Brian Ripley <ripley@stats.ox.ac.uk>.', # noqa
+ 'Maintainer': 'Brian Ripley <ripley@stats.ox.ac.uk>',
+ 'Description': 'functions for kernel smoothing (and density estimation)\n corresponding to the book: \n Wand, M.P. and Jones, M.C. (1995) "Kernel Smoothing".', # noqa
+ 'License': 'Unlimited use and distribution (see LICENCE).',
+ 'URL': 'http://www.biostat.harvard.edu/~mwand'
+ }
+
+ assert actual_metadata == expected_metadata
+
+
+@pytest.mark.fs
+def test_extract_intrinsic_metadata_failures(tmp_path):
+ """Parsing nonexistent path/archive/DESCRIPTION yields an empty dict"""
+ # inexistent first level path
+ assert extract_intrinsic_metadata('/something-inexistent') == {}
+ # nonexistent second level path (as expected by CRAN archives)
+ assert extract_intrinsic_metadata(tmp_path) == {}
+ # nonexistent DESCRIPTION within second level path
+ existing_path_no_pkginfo = str(tmp_path / 'something')
+ os.mkdir(existing_path_no_pkginfo)
+ assert extract_intrinsic_metadata(tmp_path) == {}
+
+
+def test_cran_one_visit(swh_config, requests_mock_datadir):
+ version = '2.22-6'
+ base_url = 'https://cran.r-project.org'
+ url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz' # noqa
+ loader = CRANLoader(url, version=version)
+
+ actual_load_status = loader.load()
+
+ expected_snapshot_id = '920adcccc78aaeedd3cfa4459dd900d8c3431a21'
+ assert actual_load_status == {
+ 'status': 'eventful',
+ 'snapshot_id': expected_snapshot_id
+ }
+
+ expected_snapshot = {
+ 'id': expected_snapshot_id,
+ 'branches': {
+ 'HEAD': {'target': f'releases/{version}', 'target_type': 'alias'},
+ f'releases/{version}': {
+ 'target': '42bdb16facd5140424359c8ce89a28ecfa1ce603',
+ 'target_type': 'revision'
+ }
+ }
+ }
+ check_snapshot(expected_snapshot, loader.storage)
+
+ origin_visit = next(loader.storage.origin_visit_get(url))
+ assert origin_visit['status'] == 'full'
+ assert origin_visit['type'] == 'cran'
+
+ visit_stats = get_stats(loader.storage)
+ assert {
+ 'content': 33,
+ 'directory': 7,
+ 'origin': 1,
+ 'origin_visit': 1,
+ 'person': 1,
+ 'release': 0,
+ 'revision': 1,
+ 'skipped_content': 0,
+ 'snapshot': 1
+ } == visit_stats
+
+ urls = [
+ m.url for m in requests_mock_datadir.request_history
+ if m.url.startswith(base_url)
+ ]
+ # only one visit, so the single artifact was fetched exactly once
+ assert len(urls) == 1
diff --git a/swh/loader/package/cran/tests/test_tasks.py b/swh/loader/package/cran/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/tests/test_tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def test_cran_loader(mocker, swh_app, celery_session_worker, swh_config):
+ mock_loader = mocker.patch(
+ 'swh.loader.package.cran.loader.CRANLoader.load')
+ mock_loader.return_value = {'status': 'eventful'}
+
+ res = swh_app.send_task(
+ 'swh.loader.package.cran.tasks.LoadCran',
+ (), dict(url='some-url', version='1.2.3'))
+ assert res
+ res.wait()
+ assert res.successful()
+
+ assert res.result == {'status': 'eventful'}
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -6,7 +6,6 @@
import json
import logging
import os
-import re
from codecs import BOM_UTF8
from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional
@@ -17,18 +16,14 @@
from urllib.parse import quote
from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader
-from swh.loader.package.utils import api_info, release_name
+from swh.loader.package.utils import (
+ api_info, release_name, parse_author, swh_author
+)
logger = logging.getLogger(__name__)
-_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
-
-# https://github.com/jonschlinkert/author-regex
-_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
-
-
class NpmLoader(PackageLoader):
"""Load npm origin's artifact releases into swh archive.
@@ -162,56 +157,6 @@
return None
-def parse_npm_package_author(author_str):
- """
- Parse npm package author string.
-
- It works with a flexible range of formats, as detailed below::
-
- name
- name <email> (url)
- name <email>(url)
- name<email> (url)
- name<email>(url)
- name (url) <email>
- name (url)<email>
- name(url) <email>
- name(url)<email>
- name (url)
- name(url)
- name <email>
- name<email>
- <email> (url)
- <email>(url)
- (url) <email>
- (url)<email>
- <email>
- (url)
-
- Args:
- author_str (str): input author string
-
- Returns:
- dict: A dict that may contain the following keys:
- * name
- * email
- * url
-
- """
- author = {}
- matches = re.findall(_author_regexp,
- author_str.replace('<>', '').replace('()', ''),
- re.M)
- for match in matches:
- if match[0].strip():
- author['name'] = match[0].strip()
- if match[1].strip():
- author['email'] = match[1].strip()
- if match[2].strip():
- author['url'] = match[2].strip()
- return author
-
-
def extract_npm_package_author(package_json):
"""
Extract package author from a ``package.json`` file content and
@@ -246,31 +191,9 @@
for author_key in ('author', 'authors'):
if author_key in package_json:
author_str = _author_str(package_json[author_key])
- author_data = parse_npm_package_author(author_str)
-
- name = author_data.get('name')
- email = author_data.get('email')
-
- fullname = None
-
- if name and email:
- fullname = '%s <%s>' % (name, email)
- elif name:
- fullname = name
-
- if not fullname:
- return _EMPTY_AUTHOR
-
- if fullname:
- fullname = fullname.encode('utf-8')
-
- if name:
- name = name.encode('utf-8')
-
- if email:
- email = email.encode('utf-8')
+ author_data = parse_author(author_str)
- return {'fullname': fullname, 'name': name, 'email': email}
+ return swh_author(author_data)
def _lstrip_bom(s, bom=BOM_UTF8):
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -10,7 +10,7 @@
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.npm.loader import (
- NpmLoader, parse_npm_package_author, extract_npm_package_author,
+ NpmLoader, extract_npm_package_author,
artifact_to_revision_id
)
from swh.loader.package.tests.common import (
@@ -18,148 +18,6 @@
)
-def _parse_author_string_test(author_str, expected_result):
- assert parse_npm_package_author(author_str) == expected_result
- assert parse_npm_package_author(' %s' % author_str) == expected_result
- assert parse_npm_package_author('%s ' % author_str) == expected_result
-
-
-def test_parse_npm_package_author():
- _parse_author_string_test(
- 'John Doe',
- {
- 'name': 'John Doe'
- }
- )
-
- _parse_author_string_test(
- '<john.doe@foo.bar>',
- {
- 'email': 'john.doe@foo.bar'
- }
- )
-
- _parse_author_string_test(
- '(https://john.doe)',
- {
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe <john.doe@foo.bar>',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar'
- }
- )
-
- _parse_author_string_test(
- 'John Doe<john.doe@foo.bar>',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar'
- }
- )
-
- _parse_author_string_test(
- 'John Doe (https://john.doe)',
- {
- 'name': 'John Doe',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe(https://john.doe)',
- {
- 'name': 'John Doe',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- '<john.doe@foo.bar> (https://john.doe)',
- {
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- '(https://john.doe) <john.doe@foo.bar>',
- {
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe <john.doe@foo.bar> (https://john.doe)',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe (https://john.doe) <john.doe@foo.bar>',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe<john.doe@foo.bar> (https://john.doe)',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe<john.doe@foo.bar>(https://john.doe)',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test('', {})
- _parse_author_string_test('<>', {})
- _parse_author_string_test(' <>', {})
- _parse_author_string_test('<>()', {})
- _parse_author_string_test('<> ()', {})
- _parse_author_string_test('()', {})
- _parse_author_string_test(' ()', {})
-
- _parse_author_string_test(
- 'John Doe <> ()',
- {
- 'name': 'John Doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe <>',
- {
- 'name': 'John Doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe ()',
- {
- 'name': 'John Doe'
- }
- )
-
-
def test_extract_npm_package_author(datadir):
package_metadata_filepath = os.path.join(
datadir, 'https_replicate.npmjs.com', 'org_visit1')
diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
--- a/swh/loader/package/tests/test_utils.py
+++ b/swh/loader/package/tests/test_utils.py
@@ -9,7 +9,9 @@
import swh.loader.package
-from swh.loader.package.utils import download, api_info, release_name
+from swh.loader.package.utils import (
+ download, api_info, release_name, parse_author
+)
def test_version_generation():
@@ -155,3 +157,151 @@
('0.0.1', None, 'releases/0.0.1'),
('0.0.2', 'something', 'releases/0.0.2/something')]:
assert release_name(version, filename) == expected_release
+
+
+def _parse_author_string_test(author_str, expected_result):
+ assert parse_author(author_str) == expected_result
+ assert parse_author(' %s' % author_str) == expected_result
+ assert parse_author('%s ' % author_str) == expected_result
+
+
+def test_parse_author():
+ _parse_author_string_test(
+ 'John Doe',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+ _parse_author_string_test(
+ '<john.doe@foo.bar>',
+ {
+ 'email': 'john.doe@foo.bar'
+ }
+ )
+
+ _parse_author_string_test(
+ '(https://john.doe)',
+ {
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe <john.doe@foo.bar>',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe<john.doe@foo.bar>',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe (https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe(https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ '<john.doe@foo.bar> (https://john.doe)',
+ {
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ '(https://john.doe) <john.doe@foo.bar>',
+ {
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe <john.doe@foo.bar> (https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe (https://john.doe) <john.doe@foo.bar>',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe<john.doe@foo.bar> (https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe<john.doe@foo.bar>(https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test('', {})
+ _parse_author_string_test('<>', {})
+ _parse_author_string_test(' <>', {})
+ _parse_author_string_test('<>()', {})
+ _parse_author_string_test('<> ()', {})
+ _parse_author_string_test('()', {})
+ _parse_author_string_test(' ()', {})
+
+ _parse_author_string_test(
+ 'John Doe <> ()',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe <>',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe ()',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+
+# def test_swh_author():
+# for author, expected_author in [
+# ({}, )
+# ]:
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -7,6 +7,7 @@
import logging
import os
import requests
+import re
from typing import Dict, Optional, Tuple
@@ -20,6 +21,13 @@
DOWNLOAD_HASHES = set(['sha1', 'sha256', 'length'])
+# https://github.com/jonschlinkert/author-regex
+_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
+
+
+_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
+
+
def api_info(url: str) -> Dict:
"""Basic api client to retrieve information on project. This deals with
fetching json metadata about pypi projects.
@@ -111,3 +119,79 @@
if filename:
return 'releases/%s/%s' % (version, filename)
return 'releases/%s' % version
+
+
+def parse_author(author_str: str) -> Dict[str, str]:
+ """
+ Parse a package author string (e.g. from npm or CRAN metadata).
+
+ It works with a flexible range of formats, as detailed below::
+
+ name
+ name <email> (url)
+ name <email>(url)
+ name<email> (url)
+ name<email>(url)
+ name (url) <email>
+ name (url)<email>
+ name(url) <email>
+ name(url)<email>
+ name (url)
+ name(url)
+ name <email>
+ name<email>
+ <email> (url)
+ <email>(url)
+ (url) <email>
+ (url)<email>
+ <email>
+ (url)
+
+ Args:
+ author_str (str): input author string
+
+ Returns:
+ dict: A dict that may contain the following keys:
+ * name
+ * email
+ * url
+
+ """
+ author = {}
+ matches = re.findall(_author_regexp,
+ author_str.replace('<>', '').replace('()', ''),
+ re.M)
+ for match in matches:
+ if match[0].strip():
+ author['name'] = match[0].strip()
+ if match[1].strip():
+ author['email'] = match[1].strip()
+ if match[2].strip():
+ author['url'] = match[2].strip()
+ return author
+
+
+def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]:
+ """Transform an author-like dict into the dict format swh expects (values
+ are bytes)
+
+ """
+ name = author.get('name')
+ email = author.get('email')
+
+ fullname = None
+
+ if name and email:
+ fullname = '%s <%s>' % (name, email)
+ elif name:
+ fullname = name
+
+ if not fullname:
+ r = _EMPTY_AUTHOR
+ else:
+ r = {
+ 'fullname': fullname.encode('utf-8') if fullname else None,
+ 'name': name.encode('utf-8') if name else None,
+ 'email': email.encode('utf-8') if email else None
+ }
+ return r
diff --git a/swh/loader/tests/test_cli.py b/swh/loader/tests/test_cli.py
--- a/swh/loader/tests/test_cli.py
+++ b/swh/loader/tests/test_cli.py
@@ -60,7 +60,7 @@
result = runner.invoke(run, ['-h'])
assert result.exit_code == 0
- expected_help_msg = """Usage: run [OPTIONS] [archive|debian|deposit|npm|pypi] URL [OPTIONS]...
+ expected_help_msg = """Usage: run [OPTIONS] [archive|cran|debian|deposit|npm|pypi] URL [OPTIONS]...
Ingest with loader <type> the origin located at <url>
@@ -89,7 +89,7 @@
runner = CliRunner()
result = runner.invoke(list, ['--help'])
assert result.exit_code == 0
- expected_help_msg = """Usage: list [OPTIONS] [[all|archive|debian|deposit|npm|pypi]]
+ expected_help_msg = """Usage: list [OPTIONS] [[all|archive|cran|debian|deposit|npm|pypi]]
List supported loaders and optionally their arguments
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 7:32 AM (8 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3227739
Attached To
D2463: cran loader: Add implementation
Event Timeline
Log In to Comment