D2463.id8866.diff

diff --git a/conftest.py b/conftest.py
--- a/conftest.py
+++ b/conftest.py
@@ -59,6 +59,7 @@
def celery_includes():
return [
'swh.loader.package.archive.tasks',
+ 'swh.loader.package.cran.tasks',
'swh.loader.package.debian.tasks',
'swh.loader.package.deposit.tasks',
'swh.loader.package.npm.tasks',
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@
iso8601
pkginfo
python-debian
+python-dateutil
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@
entry_points='''
[swh.workers]
loader.archive=swh.loader.package.archive:register
+ loader.cran=swh.loader.package.cran:register
loader.debian=swh.loader.package.debian:register
loader.deposit=swh.loader.package.deposit:register
loader.npm=swh.loader.package.npm:register
diff --git a/swh/loader/package/cran/__init__.py b/swh/loader/package/cran/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from typing import Any, Mapping
+
+
+def register() -> Mapping[str, Any]:
+ """Register the current worker module's definition"""
+ from .loader import CRANLoader
+ return {
+ 'task_modules': [f'{__name__}.tasks'],
+ 'loader': CRANLoader,
+ }
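
A minimal sketch of how this registration hook is consumed (the url and version values below are made-up placeholders, not part of the diff):

    from swh.loader.package.cran import register

    registry = register()
    CRANLoader = registry['loader']          # class returned by register()
    loader = CRANLoader(
        'https://cran.example.org/some-package-1.0.0.tar.gz',  # placeholder url
        version='1.0.0')
    status = loader.load()  # e.g. {'status': 'eventful', 'snapshot_id': ...}
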
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/loader.py
@@ -0,0 +1,160 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import dateutil.parser
+import datetime
+import os
+import logging
+import re
+
+from datetime import timezone
+from os import path
+from typing import Any, Generator, Dict, List, Mapping, Optional, Tuple
+
+from debian.deb822 import Deb822
+
+from swh.loader.package.loader import PackageLoader
+from swh.loader.package.utils import release_name, parse_author, swh_author
+from swh.model.identifiers import normalize_timestamp
+
+
+logger = logging.getLogger(__name__)
+
+
+DATE_PATTERN = re.compile(r'^(?P<year>\d{4})-(?P<month>\d{2})$')
+
+
+class CRANLoader(PackageLoader):
+ visit_type = 'cran'
+
+ def __init__(self, url: str, version: str):
+ """Loader constructor.
+
+ Args:
+ url: Origin url to retrieve the CRAN artifact from
+ version: Version of the CRAN artifact
+
+ """
+ super().__init__(url=url)
+ self.version = version
+ self.provider_url = url
+
+ def get_versions(self) -> List[str]:
+ # only 1 artifact
+ return [self.version]
+
+ def get_default_version(self) -> str:
+ return self.version
+
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Dict[str, Any]], None, None]:
+ p_info = {
+ 'url': self.url,
+ 'filename': path.split(self.url)[-1],
+ 'raw': {}
+ }
+ yield release_name(version), p_info
+
+ def build_revision(
+ self, a_metadata: Mapping[str, Any],
+ uncompressed_path: str) -> Dict[str, Any]:
+ # a_metadata is empty
+ metadata = extract_intrinsic_metadata(uncompressed_path)
+ normalized_date = normalize_timestamp(parse_date(metadata.get('Date')))
+ author = swh_author(parse_author(metadata.get('Maintainer', '')))
+ version = metadata.get('Version', self.version)
+ return {
+ 'message': version.encode('utf-8'),
+ 'type': 'tar',
+ 'date': normalized_date,
+ 'author': author,
+ 'committer': author,
+ 'committer_date': normalized_date,
+ 'parents': [],
+ 'metadata': {
+ 'intrinsic': {
+ 'tool': 'DESCRIPTION',
+ 'raw': metadata,
+ },
+ 'extrinsic': {
+ 'provider': self.provider_url,
+ 'when': self.visit_date.isoformat(),
+ 'raw': a_metadata,
+ },
+ },
+ }
+
+
+def parse_debian_control(filepath: str) -> Dict[str, Any]:
+ """Parse debian control at filepath"""
+ metadata: Dict = {}
+ logger.debug('Debian control file %s', filepath)
+ for paragraph in Deb822.iter_paragraphs(open(filepath)):
+ logger.debug('paragraph: %s', paragraph)
+ metadata.update(**paragraph)
+
+ logger.debug('metadata parsed: %s', metadata)
+ return metadata
+
+
+def extract_intrinsic_metadata(dir_path: str) -> Dict[str, Any]:
+ """Given an uncompressed path holding the DESCRIPTION file, returns a
+ DESCRIPTION parsed structure as a dict.
+
+ CRAN origins describe their intrinsic metadata within a DESCRIPTION file
+ at the root of the tarball tree. This DESCRIPTION file uses a simple format
+ called DCF, the Debian control format.
+
+ The release artifact contains a single folder at its root. For example:
+ $ tar tvf zprint-0.0.6.tar.gz
+ drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
+ ...
+
+ Args:
+ dir_path (str): Path to the uncompressed directory
+ representing a release artifact from CRAN.
+
+ Returns:
+ the DESCRIPTION parsed structure as a dict (or empty dict if missing)
+
+ """
+ # Retrieve the root folder of the archive
+ if not os.path.exists(dir_path):
+ return {}
+ lst = os.listdir(dir_path)
+ if len(lst) != 1:
+ return {}
+ project_dirname = lst[0]
+ description_path = os.path.join(dir_path, project_dirname, 'DESCRIPTION')
+ if not os.path.exists(description_path):
+ return {}
+ return parse_debian_control(description_path)
+
+
+def parse_date(date: Optional[str]) -> Optional[datetime.datetime]:
+ """Parse a date into a datetime
+
+ """
+ assert not date or isinstance(date, str)
+ dt: Optional[datetime.datetime] = None
+ if not date:
+ return dt
+ try:
+ specific_date = DATE_PATTERN.match(date)
+ if specific_date:
+ year = int(specific_date.group('year'))
+ month = int(specific_date.group('month'))
+ dt = datetime.datetime(year, month, 1)
+ else:
+ dt = dateutil.parser.parse(date)
+
+ if not dt.tzinfo:
+ # (up for discussion) the timezone needs to be set, otherwise
+ # normalize_timestamp raises: ValueError: normalize_timestamp
+ # received datetime without timezone: 2001-06-08 00:00:00
+ dt = dt.replace(tzinfo=timezone.utc)
+ except Exception as e:
+ logger.warning('Failed to parse date %s. Reason: %s', date, e)
+ return dt
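
To illustrate the extraction helpers above, a small usage sketch of parse_debian_control and parse_date on a hand-written DESCRIPTION snippet (the file content is a made-up example; the expected results match the test data further down):

    import tempfile
    from swh.loader.package.cran.loader import parse_date, parse_debian_control

    # Minimal DCF (Debian control format) content, as found in a DESCRIPTION file
    description = (
        b'Package: example\n'
        b'Version: 1.0.0\n'
        b'Date: 2001-June-08\n'
        b'Maintainer: Jane Doe <jane@example.org>\n'
    )

    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(description)

    metadata = parse_debian_control(f.name)
    # {'Package': 'example', 'Version': '1.0.0', 'Date': '2001-June-08', ...}

    parse_date(metadata['Date'])   # datetime(2001, 6, 8, tzinfo=timezone.utc)
    parse_date('$Date$')           # None (unparsable, a warning is logged)
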
diff --git a/swh/loader/package/cran/tasks.py b/swh/loader/package/cran/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/tasks.py
@@ -0,0 +1,14 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.loader.package.cran.loader import CRANLoader
+
+
+@shared_task(name=__name__ + '.LoadCran')
+def load_cran(url=None, version=None):
+ """Load CRAN package artifact into the archive"""
+ return CRANLoader(url, version).load()
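
For reference, the task can be enqueued by name, much like the task test below does; a hedged sketch (the Celery app configuration and the url are assumptions, not part of this diff):

    from celery import current_app  # assumes a configured Celery application

    result = current_app.send_task(
        'swh.loader.package.cran.tasks.LoadCran',
        kwargs={
            'url': 'https://cran.example.org/some-package-1.0.0.tar.gz',  # placeholder
            'version': '1.0.0',
        })
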
diff --git a/swh/loader/package/cran/tests/__init__.py b/swh/loader/package/cran/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz b/swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/tests/test_cran.py
@@ -0,0 +1,198 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import pytest
+
+from datetime import datetime, timezone
+from dateutil.tz import tzlocal
+
+from os import path
+
+from swh.loader.package.cran.loader import (
+ extract_intrinsic_metadata, CRANLoader, parse_date
+)
+from swh.core.tarball import uncompress
+
+from swh.loader.package.tests.common import (
+ check_snapshot, get_stats
+)
+
+
+def test_cran_parse_date():
+ data = [
+ # parsable, some have debatable results though
+ ('2001-June-08',
+ datetime(2001, 6, 8, 0, 0, tzinfo=timezone.utc)),
+ ('Tue Dec 27 15:06:08 PST 2011',
+ datetime(2011, 12, 27, 15, 6, 8, tzinfo=timezone.utc)),
+ ('8-14-2013',
+ datetime(2013, 8, 14, 0, 0, tzinfo=timezone.utc)),
+ ('2011-01',
+ datetime(2011, 1, 1, 0, 0, tzinfo=timezone.utc)),
+ ('201109',
+ datetime(2009, 11, 20, 0, 0, tzinfo=timezone.utc)),
+ ('04-12-2014',
+ datetime(2014, 4, 12, 0, 0, tzinfo=timezone.utc)),
+ ('2018-08-24, 10:40:10',
+ datetime(2018, 8, 24, 10, 40, 10, tzinfo=timezone.utc)),
+ ('2013-October-16',
+ datetime(2013, 10, 16, 0, 0, tzinfo=timezone.utc)),
+ ('Aug 23, 2013',
+ datetime(2013, 8, 23, 0, 0, tzinfo=timezone.utc)),
+ ('27-11-2014',
+ datetime(2014, 11, 27, 0, 0, tzinfo=timezone.utc)),
+ ('2019-09-26,',
+ datetime(2019, 9, 26, 0, 0, tzinfo=timezone.utc)),
+ ('9/25/2014',
+ datetime(2014, 9, 25, 0, 0, tzinfo=timezone.utc)),
+ ('Fri Jun 27 17:23:53 2014',
+ datetime(2014, 6, 27, 17, 23, 53, tzinfo=timezone.utc)),
+ ('28-04-2014',
+ datetime(2014, 4, 28, 0, 0, tzinfo=timezone.utc)),
+ ('04-14-2014',
+ datetime(2014, 4, 14, 0, 0, tzinfo=timezone.utc)),
+ ('2019-05-08 14:17:31 UTC',
+ datetime(2019, 5, 8, 14, 17, 31, tzinfo=timezone.utc)),
+ ('Wed May 21 13:50:39 CEST 2014',
+ datetime(2014, 5, 21, 13, 50, 39, tzinfo=tzlocal())),
+ ('2018-04-10 00:01:04 KST',
+ datetime(2018, 4, 10, 0, 1, 4, tzinfo=timezone.utc)),
+ ('2019-08-25 10:45',
+ datetime(2019, 8, 25, 10, 45, tzinfo=timezone.utc)),
+ ('March 9, 2015',
+ datetime(2015, 3, 9, 0, 0, tzinfo=timezone.utc)),
+ ('Aug. 18, 2012',
+ datetime(2012, 8, 18, 0, 0, tzinfo=timezone.utc)),
+ ('2014-Dec-17',
+ datetime(2014, 12, 17, 0, 0, tzinfo=timezone.utc)),
+ ('March 01, 2013',
+ datetime(2013, 3, 1, 0, 0, tzinfo=timezone.utc)),
+ ('2017-04-08.',
+ datetime(2017, 4, 8, 0, 0, tzinfo=timezone.utc)),
+ ('2014-Apr-22',
+ datetime(2014, 4, 22, 0, 0, tzinfo=timezone.utc)),
+ ('Mon Jan 12 19:54:04 2015',
+ datetime(2015, 1, 12, 19, 54, 4, tzinfo=timezone.utc)),
+ ('May 22, 2014',
+ datetime(2014, 5, 22, 0, 0, tzinfo=timezone.utc)),
+ ('2014-08-12 09:55:10 EDT',
+ datetime(2014, 8, 12, 9, 55, 10, tzinfo=timezone.utc)),
+ # unparsable
+ ('Fabruary 21, 2012', None),
+ ('2019-05-28"', None),
+ ('2017-03-01 today', None),
+ ('2016-11-0110.1093/icesjms/fsw182', None),
+ ('2019-07-010', None),
+ ('2015-02.23', None),
+ ('20013-12-30', None),
+ ('2016-08-017', None),
+ ('2019-02-07l', None),
+ ('2018-05-010', None),
+ ('2019-09-27 KST', None),
+ ('$Date$', None),
+ ('2019-09-27 KST', None),
+ ('2019-06-22 $Date$', None),
+ ('$Date: 2013-01-18 12:49:03 -0600 (Fri, 18 Jan 2013) $', None),
+ ('2015-7-013', None),
+ ('2018-05-023', None),
+ ("Check NEWS file for changes: news(package='simSummary')", None)
+ ]
+ for date, expected_date in data:
+ actual_date = parse_date(date)
+ assert actual_date == expected_date, f'input date to parse {date}'
+
+
+@pytest.mark.fs
+def test_extract_intrinsic_metadata(tmp_path, datadir):
+ """Parsing an existing archive's DESCRIPTION file should yield results"""
+ uncompressed_archive_path = str(tmp_path)
+ # sample url
+ # https://cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz # noqa
+ archive_path = path.join(
+ datadir, 'https_cran.r-project.org',
+ 'src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz')
+ uncompress(archive_path, dest=uncompressed_archive_path)
+
+ actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path)
+
+ expected_metadata = {
+ 'Package': 'KernSmooth',
+ 'Priority': 'recommended',
+ 'Version': '2.22-6',
+ 'Date': '2001-June-08',
+ 'Title': 'Functions for kernel smoothing for Wand & Jones (1995)',
+ 'Author': 'S original by Matt Wand.\n\tR port by Brian Ripley <ripley@stats.ox.ac.uk>.', # noqa
+ 'Maintainer': 'Brian Ripley <ripley@stats.ox.ac.uk>',
+ 'Description': 'functions for kernel smoothing (and density estimation)\n corresponding to the book: \n Wand, M.P. and Jones, M.C. (1995) "Kernel Smoothing".', # noqa
+ 'License': 'Unlimited use and distribution (see LICENCE).',
+ 'URL': 'http://www.biostat.harvard.edu/~mwand'
+ }
+
+ assert actual_metadata == expected_metadata
+
+
+@pytest.mark.fs
+def test_extract_intrinsic_metadata_failures(tmp_path):
+ """Parsing a nonexistent path/archive/DESCRIPTION yields an empty dict"""
+ # nonexistent first level path
+ assert extract_intrinsic_metadata('/something-inexistent') == {}
+ # nonexistent second level path (CRAN archives have a single top-level folder)
+ assert extract_intrinsic_metadata(tmp_path) == {}
+ # missing DESCRIPTION within the second level path
+ existing_path_no_description = str(tmp_path / 'something')
+ os.mkdir(existing_path_no_description)
+ assert extract_intrinsic_metadata(tmp_path) == {}
+
+
+def test_cran_one_visit(swh_config, requests_mock_datadir):
+ version = '2.22-6'
+ base_url = 'https://cran.r-project.org'
+ url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz' # noqa
+ loader = CRANLoader(url, version=version)
+
+ actual_load_status = loader.load()
+
+ expected_snapshot_id = '920adcccc78aaeedd3cfa4459dd900d8c3431a21'
+ assert actual_load_status == {
+ 'status': 'eventful',
+ 'snapshot_id': expected_snapshot_id
+ }
+
+ expected_snapshot = {
+ 'id': expected_snapshot_id,
+ 'branches': {
+ 'HEAD': {'target': f'releases/{version}', 'target_type': 'alias'},
+ f'releases/{version}': {
+ 'target': '42bdb16facd5140424359c8ce89a28ecfa1ce603',
+ 'target_type': 'revision'
+ }
+ }
+ }
+ check_snapshot(expected_snapshot, loader.storage)
+
+ origin_visit = next(loader.storage.origin_visit_get(url))
+ assert origin_visit['status'] == 'full'
+ assert origin_visit['type'] == 'cran'
+
+ visit_stats = get_stats(loader.storage)
+ assert {
+ 'content': 33,
+ 'directory': 7,
+ 'origin': 1,
+ 'origin_visit': 1,
+ 'person': 1,
+ 'release': 0,
+ 'revision': 1,
+ 'skipped_content': 0,
+ 'snapshot': 1
+ } == visit_stats
+
+ urls = [
+ m.url for m in requests_mock_datadir.request_history
+ if m.url.startswith(base_url)
+ ]
+ # the artifact url was requested only once
+ assert len(urls) == 1
diff --git a/swh/loader/package/cran/tests/test_tasks.py b/swh/loader/package/cran/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cran/tests/test_tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def test_cran_loader(mocker, swh_app, celery_session_worker, swh_config):
+ mock_loader = mocker.patch(
+ 'swh.loader.package.cran.loader.CRANLoader.load')
+ mock_loader.return_value = {'status': 'eventful'}
+
+ res = swh_app.send_task(
+ 'swh.loader.package.cran.tasks.LoadCran',
+ (), dict(url='some-url', version='1.2.3'))
+ assert res
+ res.wait()
+ assert res.successful()
+
+ assert res.result == {'status': 'eventful'}
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -6,7 +6,6 @@
import json
import logging
import os
-import re
from codecs import BOM_UTF8
from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional
@@ -17,18 +16,14 @@
from urllib.parse import quote
from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader
-from swh.loader.package.utils import api_info, release_name
+from swh.loader.package.utils import (
+ api_info, release_name, parse_author, swh_author
+)
logger = logging.getLogger(__name__)
-_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
-
-# https://github.com/jonschlinkert/author-regex
-_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
-
-
class NpmLoader(PackageLoader):
"""Load npm origin's artifact releases into swh archive.
@@ -162,56 +157,6 @@
return None
-def parse_npm_package_author(author_str):
- """
- Parse npm package author string.
-
- It works with a flexible range of formats, as detailed below::
-
- name
- name <email> (url)
- name <email>(url)
- name<email> (url)
- name<email>(url)
- name (url) <email>
- name (url)<email>
- name(url) <email>
- name(url)<email>
- name (url)
- name(url)
- name <email>
- name<email>
- <email> (url)
- <email>(url)
- (url) <email>
- (url)<email>
- <email>
- (url)
-
- Args:
- author_str (str): input author string
-
- Returns:
- dict: A dict that may contain the following keys:
- * name
- * email
- * url
-
- """
- author = {}
- matches = re.findall(_author_regexp,
- author_str.replace('<>', '').replace('()', ''),
- re.M)
- for match in matches:
- if match[0].strip():
- author['name'] = match[0].strip()
- if match[1].strip():
- author['email'] = match[1].strip()
- if match[2].strip():
- author['url'] = match[2].strip()
- return author
-
-
def extract_npm_package_author(package_json):
"""
Extract package author from a ``package.json`` file content and
@@ -246,31 +191,9 @@
for author_key in ('author', 'authors'):
if author_key in package_json:
author_str = _author_str(package_json[author_key])
- author_data = parse_npm_package_author(author_str)
-
- name = author_data.get('name')
- email = author_data.get('email')
-
- fullname = None
-
- if name and email:
- fullname = '%s <%s>' % (name, email)
- elif name:
- fullname = name
-
- if not fullname:
- return _EMPTY_AUTHOR
-
- if fullname:
- fullname = fullname.encode('utf-8')
-
- if name:
- name = name.encode('utf-8')
-
- if email:
- email = email.encode('utf-8')
+ author_data = parse_author(author_str)
- return {'fullname': fullname, 'name': name, 'email': email}
+ return swh_author(author_data)
def _lstrip_bom(s, bom=BOM_UTF8):
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -10,7 +10,7 @@
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.npm.loader import (
- NpmLoader, parse_npm_package_author, extract_npm_package_author,
+ NpmLoader, extract_npm_package_author,
artifact_to_revision_id
)
from swh.loader.package.tests.common import (
@@ -18,148 +18,6 @@
)
-def _parse_author_string_test(author_str, expected_result):
- assert parse_npm_package_author(author_str) == expected_result
- assert parse_npm_package_author(' %s' % author_str) == expected_result
- assert parse_npm_package_author('%s ' % author_str) == expected_result
-
-
-def test_parse_npm_package_author():
- _parse_author_string_test(
- 'John Doe',
- {
- 'name': 'John Doe'
- }
- )
-
- _parse_author_string_test(
- '<john.doe@foo.bar>',
- {
- 'email': 'john.doe@foo.bar'
- }
- )
-
- _parse_author_string_test(
- '(https://john.doe)',
- {
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe <john.doe@foo.bar>',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar'
- }
- )
-
- _parse_author_string_test(
- 'John Doe<john.doe@foo.bar>',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar'
- }
- )
-
- _parse_author_string_test(
- 'John Doe (https://john.doe)',
- {
- 'name': 'John Doe',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe(https://john.doe)',
- {
- 'name': 'John Doe',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- '<john.doe@foo.bar> (https://john.doe)',
- {
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- '(https://john.doe) <john.doe@foo.bar>',
- {
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe <john.doe@foo.bar> (https://john.doe)',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe (https://john.doe) <john.doe@foo.bar>',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe<john.doe@foo.bar> (https://john.doe)',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe<john.doe@foo.bar>(https://john.doe)',
- {
- 'name': 'John Doe',
- 'email': 'john.doe@foo.bar',
- 'url': 'https://john.doe'
- }
- )
-
- _parse_author_string_test('', {})
- _parse_author_string_test('<>', {})
- _parse_author_string_test(' <>', {})
- _parse_author_string_test('<>()', {})
- _parse_author_string_test('<> ()', {})
- _parse_author_string_test('()', {})
- _parse_author_string_test(' ()', {})
-
- _parse_author_string_test(
- 'John Doe <> ()',
- {
- 'name': 'John Doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe <>',
- {
- 'name': 'John Doe'
- }
- )
-
- _parse_author_string_test(
- 'John Doe ()',
- {
- 'name': 'John Doe'
- }
- )
-
-
def test_extract_npm_package_author(datadir):
package_metadata_filepath = os.path.join(
datadir, 'https_replicate.npmjs.com', 'org_visit1')
diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
--- a/swh/loader/package/tests/test_utils.py
+++ b/swh/loader/package/tests/test_utils.py
@@ -9,7 +9,9 @@
import swh.loader.package
-from swh.loader.package.utils import download, api_info, release_name
+from swh.loader.package.utils import (
+ download, api_info, release_name, parse_author
+)
def test_version_generation():
@@ -155,3 +157,151 @@
('0.0.1', None, 'releases/0.0.1'),
('0.0.2', 'something', 'releases/0.0.2/something')]:
assert release_name(version, filename) == expected_release
+
+
+def _parse_author_string_test(author_str, expected_result):
+ assert parse_author(author_str) == expected_result
+ assert parse_author(' %s' % author_str) == expected_result
+ assert parse_author('%s ' % author_str) == expected_result
+
+
+def test_parse_author():
+ _parse_author_string_test(
+ 'John Doe',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+ _parse_author_string_test(
+ '<john.doe@foo.bar>',
+ {
+ 'email': 'john.doe@foo.bar'
+ }
+ )
+
+ _parse_author_string_test(
+ '(https://john.doe)',
+ {
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe <john.doe@foo.bar>',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe<john.doe@foo.bar>',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe (https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe(https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ '<john.doe@foo.bar> (https://john.doe)',
+ {
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ '(https://john.doe) <john.doe@foo.bar>',
+ {
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe <john.doe@foo.bar> (https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe (https://john.doe) <john.doe@foo.bar>',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe<john.doe@foo.bar> (https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe<john.doe@foo.bar>(https://john.doe)',
+ {
+ 'name': 'John Doe',
+ 'email': 'john.doe@foo.bar',
+ 'url': 'https://john.doe'
+ }
+ )
+
+ _parse_author_string_test('', {})
+ _parse_author_string_test('<>', {})
+ _parse_author_string_test(' <>', {})
+ _parse_author_string_test('<>()', {})
+ _parse_author_string_test('<> ()', {})
+ _parse_author_string_test('()', {})
+ _parse_author_string_test(' ()', {})
+
+ _parse_author_string_test(
+ 'John Doe <> ()',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe <>',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+ _parse_author_string_test(
+ 'John Doe ()',
+ {
+ 'name': 'John Doe'
+ }
+ )
+
+
+# def test_swh_author():
+# for author, expected_author in [
+# ({}, )
+# ]:
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -7,6 +7,7 @@
import logging
import os
import requests
+import re
from typing import Dict, Optional, Tuple
@@ -20,6 +21,13 @@
DOWNLOAD_HASHES = set(['sha1', 'sha256', 'length'])
+# https://github.com/jonschlinkert/author-regex
+_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
+
+
+_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
+
+
def api_info(url: str) -> Dict:
"""Basic api client to retrieve information on project. This deals with
fetching json metadata about pypi projects.
@@ -111,3 +119,79 @@
if filename:
return 'releases/%s/%s' % (version, filename)
return 'releases/%s' % version
+
+
+def parse_author(author_str: str) -> Dict[str, str]:
+ """
+ Parse npm package author string.
+
+ It works with a flexible range of formats, as detailed below::
+
+ name
+ name <email> (url)
+ name <email>(url)
+ name<email> (url)
+ name<email>(url)
+ name (url) <email>
+ name (url)<email>
+ name(url) <email>
+ name(url)<email>
+ name (url)
+ name(url)
+ name <email>
+ name<email>
+ <email> (url)
+ <email>(url)
+ (url) <email>
+ (url)<email>
+ <email>
+ (url)
+
+ Args:
+ author_str (str): input author string
+
+ Returns:
+ dict: A dict that may contain the following keys:
+ * name
+ * email
+ * url
+
+ """
+ author = {}
+ matches = re.findall(_author_regexp,
+ author_str.replace('<>', '').replace('()', ''),
+ re.M)
+ for match in matches:
+ if match[0].strip():
+ author['name'] = match[0].strip()
+ if match[1].strip():
+ author['email'] = match[1].strip()
+ if match[2].strip():
+ author['url'] = match[2].strip()
+ return author
+
+
+def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]:
+ """Transform an author like dict to an expected swh like dict (values are
+ bytes)
+
+ """
+ name = author.get('name')
+ email = author.get('email')
+
+ fullname = None
+
+ if name and email:
+ fullname = '%s <%s>' % (name, email)
+ elif name:
+ fullname = name
+
+ if not fullname:
+ r = _EMPTY_AUTHOR
+ else:
+ r = {
+ 'fullname': fullname.encode('utf-8') if fullname else None,
+ 'name': name.encode('utf-8') if name else None,
+ 'email': email.encode('utf-8') if email else None
+ }
+ return r
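
A short usage sketch of the helpers now shared between the npm and CRAN loaders (the author string is an arbitrary example):

    from swh.loader.package.utils import parse_author, swh_author

    author = parse_author('John Doe <john.doe@foo.bar>')
    # {'name': 'John Doe', 'email': 'john.doe@foo.bar'}

    swh_author(author)
    # {'fullname': b'John Doe <john.doe@foo.bar>',
    #  'name': b'John Doe',
    #  'email': b'john.doe@foo.bar'}

    swh_author({})  # no name: falls back to the empty author
    # {'fullname': b'', 'name': None, 'email': None}
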
diff --git a/swh/loader/tests/test_cli.py b/swh/loader/tests/test_cli.py
--- a/swh/loader/tests/test_cli.py
+++ b/swh/loader/tests/test_cli.py
@@ -60,7 +60,7 @@
result = runner.invoke(run, ['-h'])
assert result.exit_code == 0
- expected_help_msg = """Usage: run [OPTIONS] [archive|debian|deposit|npm|pypi] URL [OPTIONS]...
+ expected_help_msg = """Usage: run [OPTIONS] [archive|cran|debian|deposit|npm|pypi] URL [OPTIONS]...
Ingest with loader <type> the origin located at <url>
@@ -89,7 +89,7 @@
runner = CliRunner()
result = runner.invoke(list, ['--help'])
assert result.exit_code == 0
- expected_help_msg = """Usage: list [OPTIONS] [[all|archive|debian|deposit|npm|pypi]]
+ expected_help_msg = """Usage: list [OPTIONS] [[all|archive|cran|debian|deposit|npm|pypi]]
List supported loaders and optionally their arguments
