diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
index 9360c82..239c653 100644
--- a/swh/loader/package/cran/loader.py
+++ b/swh/loader/package/cran/loader.py
@@ -1,181 +1,186 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import dateutil.parser
 import datetime
 import os
 import logging
 import re

 from datetime import timezone
 from os import path
 from typing import Any, Generator, Dict, List, Mapping, Optional, Tuple

 from debian.deb822 import Deb822

 from swh.loader.package.loader import PackageLoader
 from swh.loader.package.utils import (
     release_name, parse_author, swh_author, artifact_identity
 )
 from swh.model.identifiers import normalize_timestamp


 logger = logging.getLogger(__name__)


 DATE_PATTERN = re.compile(r'^(?P<year>\d{4})-(?P<month>\d{2})$')


 class CRANLoader(PackageLoader):
     visit_type = 'cran'

-    def __init__(self, url: str, version: str):
+    def __init__(self, url: str, artifacts: List[Dict]):
         """Loader constructor.

         Args:
-            url: Origin url to retrieve cran artifact from
-            version: version of the cran artifact
+            url: Origin url to retrieve cran artifact(s) from
+            artifacts: List of associated artifacts for the origin url

         """
         super().__init__(url=url)
-        self.version = version
         # make explicit what we consider the artifact identity
         self.id_keys = ['url', 'version']
-        self.artifact = {'url': url, 'version': version}
+        self.artifacts = artifacts

     def get_versions(self) -> List[str]:
-        # only 1 artifact
-        return [self.version]
+        versions = []
+        for artifact in self.artifacts:
+            versions.append(artifact['version'])
+        return versions

     def get_default_version(self) -> str:
-        return self.version
+        return self.artifacts[-1]['version']

     def get_package_info(self, version: str) -> Generator[
             Tuple[str, Dict[str, Any]], None, None]:
-        p_info = {
-            'url': self.url,
-            'filename': path.basename(self.url),
-            'raw': self.artifact,
-        }
-        yield release_name(version), p_info
+        for a_metadata in self.artifacts:
+            url = a_metadata['url']
+            package_version = a_metadata['version']
+            if version == package_version:
+                p_info = {
+                    'url': url,
+                    'filename': path.basename(url),
+                    'raw': a_metadata,
+                }
+                yield release_name(version), p_info
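
Note: since `get_default_version` simply takes the last entry, the loader assumes the `artifacts` list arrives ordered oldest-to-newest. A minimal standalone sketch of the selection logic above, using made-up artifact data (not part of the patch):

```python
from os import path

# Hypothetical artifact list, ordered oldest to newest as the loader assumes.
artifacts = [
    {'url': 'https://cran.example.org/pkg_1.0.tar.gz', 'version': '1.0'},
    {'url': 'https://cran.example.org/pkg_1.1.tar.gz', 'version': '1.1'},
]

versions = [a['version'] for a in artifacts]   # what get_versions() returns
default = artifacts[-1]['version']             # what get_default_version() returns

# get_package_info(default) yields one (branch, p_info) pair per matching
# artifact; 'releases/<version>' mirrors release_name().
for a_metadata in artifacts:
    if a_metadata['version'] == default:
        p_info = {
            'url': a_metadata['url'],
            'filename': path.basename(a_metadata['url']),
            'raw': a_metadata,
        }
        print(f'releases/{default}', p_info)
```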

     def resolve_revision_from(
             self, known_artifacts: Mapping[bytes, Mapping],
             artifact_metadata: Mapping[str, Any]) \
             -> Optional[bytes]:
         """Given known_artifacts per revision, try to determine the
            revision for artifact_metadata

         """
         new_identity = artifact_identity(artifact_metadata, self.id_keys)
         for rev_id, known_artifact_meta in known_artifacts.items():
             logger.debug('known_artifact_meta: %s', known_artifact_meta)
             known_artifact = known_artifact_meta['extrinsic']['raw']
             known_identity = artifact_identity(known_artifact, self.id_keys)
             if new_identity == known_identity:
                 return rev_id
         return None

     def build_revision(
             self, a_metadata: Mapping[str, Any],
             uncompressed_path: str) -> Dict[str, Any]:
-        # a_metadata is empty
         metadata = extract_intrinsic_metadata(uncompressed_path)
         normalized_date = normalize_timestamp(parse_date(metadata.get('Date')))
         author = swh_author(parse_author(metadata.get('Maintainer', {})))
-        version = metadata.get('Version', self.version)
+        version = metadata.get('Version', a_metadata['version'])
         return {
             'message': version.encode('utf-8'),
             'type': 'tar',
             'date': normalized_date,
             'author': author,
             'committer': author,
             'committer_date': normalized_date,
             'parents': [],
             'metadata': {
                 'intrinsic': {
                     'tool': 'DESCRIPTION',
                     'raw': metadata,
                 },
                 'extrinsic': {
                     'provider': self.url,
                     'when': self.visit_date.isoformat(),
                     'raw': a_metadata,
                 },
             },
         }


 def parse_debian_control(filepath: str) -> Dict[str, Any]:
     """Parse debian control at filepath"""
     metadata: Dict = {}
     logger.debug('Debian control file %s', filepath)
     for paragraph in Deb822.iter_paragraphs(open(filepath, 'rb')):
         logger.debug('paragraph: %s', paragraph)
         metadata.update(**paragraph)
     logger.debug('metadata parsed: %s', metadata)
     return metadata


 def extract_intrinsic_metadata(dir_path: str) -> Dict[str, Any]:
     """Given an uncompressed path holding the DESCRIPTION file, returns a
        DESCRIPTION parsed structure as a dict.

        CRAN origins describe their intrinsic metadata within a DESCRIPTION
        file at the root tree of a tarball. This DESCRIPTION uses a simple
        file format called DCF, the Debian control format.

        Each release artifact contains a single folder at its root.
        For example:
        $ tar tvf zprint-0.0.6.tar.gz
        drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
        ...

     Args:
         dir_path (str): Path to the uncompressed directory
             representing a release artifact from CRAN.

     Returns:
         the DESCRIPTION parsed structure as a dict (or empty dict if missing)

     """
     # Retrieve the root folder of the archive
     if not os.path.exists(dir_path):
         return {}
     lst = os.listdir(dir_path)
     if len(lst) != 1:
         return {}
     project_dirname = lst[0]
     description_path = os.path.join(dir_path, project_dirname, 'DESCRIPTION')
     if not os.path.exists(description_path):
         return {}
     return parse_debian_control(description_path)


 def parse_date(date: Optional[str]) -> Optional[datetime.datetime]:
     """Parse a date into a datetime

     """
     assert not date or isinstance(date, str)
     dt: Optional[datetime.datetime] = None
     if not date:
         return dt
     try:
         specific_date = DATE_PATTERN.match(date)
         if specific_date:
             year = int(specific_date.group('year'))
             month = int(specific_date.group('month'))
             dt = datetime.datetime(year, month, 1)
         else:
             dt = dateutil.parser.parse(date)

         if not dt.tzinfo:
             # The timezone needs to be set; otherwise normalize_timestamp
             # raises: ValueError: normalize_timestamp received datetime
             # without timezone: 2001-06-08 00:00:00
             dt = dt.replace(tzinfo=timezone.utc)
     except Exception as e:
         logger.warning('Failed to parse date %s. Reason: %s', date, e)
     return dt
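
A quick check of `parse_date`'s three paths, runnable once `swh.loader.package` is installed (the inputs below come from the test data further down):

```python
from swh.loader.package.cran.loader import parse_date

print(parse_date('2011-01'))       # DATE_PATTERN path -> 2011-01-01 00:00:00+00:00
print(parse_date('Aug 23, 2013'))  # dateutil fallback -> 2013-08-23 00:00:00+00:00
print(parse_date('$Date$'))        # unparsable -> logs a warning, returns None
print(parse_date(None))            # missing Date field -> None
```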
diff --git a/swh/loader/package/cran/tasks.py b/swh/loader/package/cran/tasks.py
index 6cbd788..cd6111a 100644
--- a/swh/loader/package/cran/tasks.py
+++ b/swh/loader/package/cran/tasks.py
@@ -1,14 +1,14 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from celery import shared_task

 from swh.loader.package.cran.loader import CRANLoader


 @shared_task(name=__name__ + '.LoadCRAN')
-def load_cran(url=None, version=None):
-    """Load archive's artifacts (e.g gnu, etc...)"""
-    return CRANLoader(url, version).load()
+def load_cran(url=None, artifacts=[]):
+    """Load CRAN's artifacts"""
+    return CRANLoader(url, artifacts).load()
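
Callers now hand the loader its artifact list through the task kwargs. A hedged sketch of enqueueing the task against an already-configured Celery app (the origin and artifact URLs here are made up):

```python
from celery import current_app

# Hypothetical origin: one package page with a single source tarball.
current_app.send_task(
    'swh.loader.package.cran.tasks.LoadCRAN',
    kwargs={
        'url': 'https://cran.example.org/package=somepkg',
        'artifacts': [{
            'url': 'https://cran.example.org/src/contrib/somepkg_1.0.tar.gz',
            'version': '1.0',
        }],
    })
```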
diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py
index e807c86..017c2fe 100644
--- a/swh/loader/package/cran/tests/test_cran.py
+++ b/swh/loader/package/cran/tests/test_cran.py
@@ -1,317 +1,325 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import pytest

 from datetime import datetime, timezone
 from dateutil.tz import tzlocal
 from os import path

 from swh.loader.package.cran.loader import (
     extract_intrinsic_metadata, CRANLoader, parse_date,
     parse_debian_control
 )
 from swh.core.tarball import uncompress

 from swh.loader.package.tests.common import (
     check_snapshot, get_stats
 )


 def test_cran_parse_date():
     data = [
         # parsable, some have debatable results though
         ('2001-June-08',
          datetime(2001, 6, 8, 0, 0, tzinfo=timezone.utc)),
         ('Tue Dec 27 15:06:08 PST 2011',
          datetime(2011, 12, 27, 15, 6, 8, tzinfo=timezone.utc)),
         ('8-14-2013',
          datetime(2013, 8, 14, 0, 0, tzinfo=timezone.utc)),
         ('2011-01',
          datetime(2011, 1, 1, 0, 0, tzinfo=timezone.utc)),
         ('201109',
          datetime(2009, 11, 20, 0, 0, tzinfo=timezone.utc)),
         ('04-12-2014',
          datetime(2014, 4, 12, 0, 0, tzinfo=timezone.utc)),
         ('2018-08-24, 10:40:10',
          datetime(2018, 8, 24, 10, 40, 10, tzinfo=timezone.utc)),
         ('2013-October-16',
          datetime(2013, 10, 16, 0, 0, tzinfo=timezone.utc)),
         ('Aug 23, 2013',
          datetime(2013, 8, 23, 0, 0, tzinfo=timezone.utc)),
         ('27-11-2014',
          datetime(2014, 11, 27, 0, 0, tzinfo=timezone.utc)),
         ('2019-09-26,',
          datetime(2019, 9, 26, 0, 0, tzinfo=timezone.utc)),
         ('9/25/2014',
          datetime(2014, 9, 25, 0, 0, tzinfo=timezone.utc)),
         ('Fri Jun 27 17:23:53 2014',
          datetime(2014, 6, 27, 17, 23, 53, tzinfo=timezone.utc)),
         ('28-04-2014',
          datetime(2014, 4, 28, 0, 0, tzinfo=timezone.utc)),
         ('04-14-2014',
          datetime(2014, 4, 14, 0, 0, tzinfo=timezone.utc)),
         ('2019-05-08 14:17:31 UTC',
          datetime(2019, 5, 8, 14, 17, 31, tzinfo=timezone.utc)),
         ('Wed May 21 13:50:39 CEST 2014',
          datetime(2014, 5, 21, 13, 50, 39, tzinfo=tzlocal())),
         ('2018-04-10 00:01:04 KST',
          datetime(2018, 4, 10, 0, 1, 4, tzinfo=timezone.utc)),
         ('2019-08-25 10:45',
          datetime(2019, 8, 25, 10, 45, tzinfo=timezone.utc)),
         ('March 9, 2015',
          datetime(2015, 3, 9, 0, 0, tzinfo=timezone.utc)),
         ('Aug. 18, 2012',
          datetime(2012, 8, 18, 0, 0, tzinfo=timezone.utc)),
         ('2014-Dec-17',
          datetime(2014, 12, 17, 0, 0, tzinfo=timezone.utc)),
         ('March 01, 2013',
          datetime(2013, 3, 1, 0, 0, tzinfo=timezone.utc)),
         ('2017-04-08.',
          datetime(2017, 4, 8, 0, 0, tzinfo=timezone.utc)),
         ('2014-Apr-22',
          datetime(2014, 4, 22, 0, 0, tzinfo=timezone.utc)),
         ('Mon Jan 12 19:54:04 2015',
          datetime(2015, 1, 12, 19, 54, 4, tzinfo=timezone.utc)),
         ('May 22, 2014',
          datetime(2014, 5, 22, 0, 0, tzinfo=timezone.utc)),
         ('2014-08-12 09:55:10 EDT',
          datetime(2014, 8, 12, 9, 55, 10, tzinfo=timezone.utc)),
         # unparsable
         ('Fabruary 21, 2012', None),
         ('2019-05-28"', None),
         ('2017-03-01 today', None),
         ('2016-11-0110.1093/icesjms/fsw182', None),
         ('2019-07-010', None),
         ('2015-02.23', None),
         ('20013-12-30', None),
         ('2016-08-017', None),
         ('2019-02-07l', None),
         ('2018-05-010', None),
         ('2019-09-27 KST', None),
         ('$Date$', None),
         ('2019-09-27 KST', None),
         ('2019-06-22 $Date$', None),
         ('$Date: 2013-01-18 12:49:03 -0600 (Fri, 18 Jan 2013) $', None),
         ('2015-7-013', None),
         ('2018-05-023', None),
         ("Check NEWS file for changes: news(package='simSummary')", None)
     ]
     for date, expected_date in data:
         actual_date = parse_date(date)
         assert actual_date == expected_date, f'input date to parse {date}'


 @pytest.mark.fs
 def test_extract_intrinsic_metadata(tmp_path, datadir):
     """Parsing an existing archive's DESCRIPTION should yield results"""
     uncompressed_archive_path = str(tmp_path)
     # sample url
     # https://cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz  # noqa
     archive_path = path.join(
         datadir, 'https_cran.r-project.org',
         'src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz')
     uncompress(archive_path, dest=uncompressed_archive_path)

     actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path)

     expected_metadata = {
         'Package': 'KernSmooth',
         'Priority': 'recommended',
         'Version': '2.22-6',
         'Date': '2001-June-08',
         'Title': 'Functions for kernel smoothing for Wand & Jones (1995)',
         'Author': 'S original by Matt Wand.\n\tR port by Brian Ripley <ripley@stats.ox.ac.uk>.',  # noqa
         'Maintainer': 'Brian Ripley <ripley@stats.ox.ac.uk>',
         'Description': 'functions for kernel smoothing (and density estimation)\n corresponding to the book: \n Wand, M.P. and Jones, M.C. (1995) "Kernel Smoothing".',  # noqa
         'License': 'Unlimited use and distribution (see LICENCE).',
         'URL': 'http://www.biostat.harvard.edu/~mwand'
     }
     assert actual_metadata == expected_metadata


 @pytest.mark.fs
 def test_extract_intrinsic_metadata_failures(tmp_path):
     """Parsing a nonexistent path/archive/DESCRIPTION yields an empty dict"""
     # nonexistent first-level path
     assert extract_intrinsic_metadata('/something-inexistent') == {}
     # path with no single second-level directory (CRAN tarballs have one)
     assert extract_intrinsic_metadata(tmp_path) == {}
     # no DESCRIPTION within the second-level directory
     existing_path_no_pkginfo = str(tmp_path / 'something')
     os.mkdir(existing_path_no_pkginfo)
     assert extract_intrinsic_metadata(tmp_path) == {}
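
DESCRIPTION files use DCF, which python-debian's `Deb822` parses directly; that is all `parse_debian_control` does. A minimal sketch with an inline sample (assuming python-debian is installed; the package data is invented):

```python
import io

from debian.deb822 import Deb822

# Invented DESCRIPTION content in Debian control format (DCF).
sample = b"""Package: somepkg
Version: 1.0
Title: An Invented Package
Maintainer: Jane Doe <jane@example.org>
"""

metadata = {}
for paragraph in Deb822.iter_paragraphs(io.BytesIO(sample)):
    metadata.update(**paragraph)

print(metadata['Package'], metadata['Version'])  # somepkg 1.0
```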
(1995) "Kernel Smoothing".', # noqa 'License': 'Unlimited use and distribution (see LICENCE).', 'URL': 'http://www.biostat.harvard.edu/~mwand' } assert actual_metadata == expected_metadata @pytest.mark.fs def test_extract_intrinsic_metadata_failures(tmp_path): """Parsing inexistent path/archive/PKG-INFO yield None""" # inexistent first level path assert extract_intrinsic_metadata('/something-inexistent') == {} # inexistent second level path (as expected by pypi archives) assert extract_intrinsic_metadata(tmp_path) == {} # inexistent PKG-INFO within second level path existing_path_no_pkginfo = str(tmp_path / 'something') os.mkdir(existing_path_no_pkginfo) assert extract_intrinsic_metadata(tmp_path) == {} def test_cran_one_visit(swh_config, requests_mock_datadir): version = '2.22-6' base_url = 'https://cran.r-project.org' - url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz' # noqa - loader = CRANLoader(url, version=version) + origin_url = f'{base_url}/Packages/Recommended_KernSmooth/index.html' + artifact_url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz' # noqa + loader = CRANLoader(origin_url, artifacts=[{ + 'url': artifact_url, + 'version': version, + }]) actual_load_status = loader.load() expected_snapshot_id = '920adcccc78aaeedd3cfa4459dd900d8c3431a21' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } expected_snapshot = { 'id': expected_snapshot_id, 'branches': { 'HEAD': {'target': f'releases/{version}', 'target_type': 'alias'}, f'releases/{version}': { 'target': '42bdb16facd5140424359c8ce89a28ecfa1ce603', 'target_type': 'revision' } } } check_snapshot(expected_snapshot, loader.storage) - origin_visit = next(loader.storage.origin_visit_get(url)) + origin_visit = next(loader.storage.origin_visit_get(origin_url)) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'cran' visit_stats = get_stats(loader.storage) assert { 'content': 33, 'directory': 7, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 1, 'skipped_content': 0, 'snapshot': 1 } == visit_stats urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith(base_url) ] # visited each artifact once across 2 visits assert len(urls) == 1 def test_cran_2_visits_same_origin( swh_config, requests_mock_datadir): """Multiple visits on the same origin, only 1 archive fetch""" version = '2.22-6' base_url = 'https://cran.r-project.org' - url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz' # noqa - loader = CRANLoader(url, version=version) + origin_url = f'{base_url}/Packages/Recommended_KernSmooth/index.html' + artifact_url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz' # noqa + loader = CRANLoader(origin_url, artifacts=[{ + 'url': artifact_url, + 'version': version + }]) # first visit actual_load_status = loader.load() expected_snapshot_id = '920adcccc78aaeedd3cfa4459dd900d8c3431a21' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } expected_snapshot = { 'id': expected_snapshot_id, 'branches': { 'HEAD': {'target': f'releases/{version}', 'target_type': 'alias'}, f'releases/{version}': { 'target': '42bdb16facd5140424359c8ce89a28ecfa1ce603', 'target_type': 'revision' } } } check_snapshot(expected_snapshot, loader.storage) - origin_visit = next(loader.storage.origin_visit_get(url)) + origin_visit = next(loader.storage.origin_visit_get(origin_url)) assert origin_visit['status'] == 'full' assert 

 def test_parse_debian_control(datadir):
     description_file = os.path.join(datadir, 'description', 'acepack')

     actual_metadata = parse_debian_control(description_file)

     assert actual_metadata == {
         'Package': 'acepack',
         'Maintainer': 'Shawn Garbett',
         'Version': '1.4.1',
         'Author': 'Phil Spector, Jerome Friedman, Robert Tibshirani...',
         'Description': 'Two nonparametric methods for multiple regression...',
         'Title': 'ACE & AVAS 4 Selecting Multiple Regression Transformations',
         'License': 'MIT + file LICENSE',
         'Suggests': 'testthat',
         'Packaged': '2016-10-28 15:38:59 UTC; garbetsp',
         'Repository': 'CRAN',
         'Date/Publication': '2016-10-29 00:11:52',
         'NeedsCompilation': 'yes'
     }


 def test_parse_debian_control_unicode_issue(datadir):
     # iso-8859-1 caused failure, now fixed
     description_file = os.path.join(
         datadir, 'description', 'KnownBR'
     )

     actual_metadata = parse_debian_control(description_file)

     assert actual_metadata == {
         'Package': 'KnowBR',
         'Version': '2.0',
         'Title': '''Discriminating Well Surveyed Spatial Units from Exhaustive
        Biodiversity Databases''',
         'Author': 'Cástor Guisande González and Jorge M. Lobo',
         'Maintainer': 'Cástor Guisande González <castor@uvigo.es>',
         'Description': 'It uses species accumulation curves and diverse estimators...',
         'License': 'GPL (>= 2)',
         'Encoding': 'latin1',
         'Depends': 'R (>= 3.0), fossil, mgcv, plotrix, sp, vegan',
         'Suggests': 'raster, rgbif',
         'NeedsCompilation': 'no',
         'Packaged': '2019-01-30 13:27:29 UTC; castor',
         'Repository': 'CRAN',
         'Date/Publication': '2019-01-31 20:53:50 UTC'
     }
diff --git a/swh/loader/package/cran/tests/test_tasks.py b/swh/loader/package/cran/tests/test_tasks.py
index cb42c22..28edb9a 100644
--- a/swh/loader/package/cran/tests/test_tasks.py
+++ b/swh/loader/package/cran/tests/test_tasks.py
@@ -1,20 +1,26 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information


 def test_cran_loader(mocker, swh_app, celery_session_worker, swh_config):
     mock_loader = mocker.patch(
         'swh.loader.package.cran.loader.CRANLoader.load')
     mock_loader.return_value = {'status': 'eventful'}

     res = swh_app.send_task(
         'swh.loader.package.cran.tasks.LoadCRAN',
-        kwargs={'url': 'some-url', 'version': '1.2.3'}
+        kwargs={
+            'url': 'some-url',
+            'artifacts': [{
+                'version': '1.2.3',
+                'url': 'artifact-url'
+            }]
+        }
     )
     assert res
     res.wait()
     assert res.successful()

     assert res.result == {'status': 'eventful'}