diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py
index de3a4d0..8e207c4 100644
--- a/swh/loader/package/archive/tests/test_archive.py
+++ b/swh/loader/package/archive/tests/test_archive.py
@@ -1,345 +1,345 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from swh.model.hashutil import hash_to_bytes
 from swh.loader.package.archive.loader import ArchiveLoader
 from swh.loader.package.tests.common import (
     check_snapshot, check_metadata_paths, get_stats
 )

 URL = 'https://ftp.gnu.org/gnu/8sync/'

 GNU_ARTIFACTS = [
     {
         'time': 944729610,
         'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
         'length': 221837,
         'filename': '8sync-0.1.0.tar.gz',
         'version': '0.1.0',
     }
 ]

 _expected_new_contents_first_visit = [
     'e9258d81faf5881a2f96a77ba609396f82cb97ad',
     '1170cf105b04b7e2822a0e09d2acf71da7b9a130',
     'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac',
     '0057bec9b5422aff9256af240b177ac0e3ac2608',
     '2b8d0d0b43a1078fc708930c8ddc2956a86c566e',
     '27de3b3bc6545d2a797aeeb4657c0e215a0c2e55',
     '2e6db43f5cd764e677f416ff0d0c78c7a82ef19b',
     'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62',
     'edeb33282b2bffa0e608e9d2fd960fd08093c0ea',
     'd64e64d4c73679323f8d4cde2643331ba6c20af9',
     '7a756602914be889c0a2d3952c710144b3e64cb0',
     '84fb589b554fcb7f32b806951dcf19518d67b08f',
     '8624bcdae55baeef00cd11d5dfcfa60f68710a02',
     'e08441aeab02704cfbd435d6445f7c072f8f524e',
     'f67935bc3a83a67259cda4b2d43373bd56703844',
     '809788434b433eb2e3cfabd5d591c9a659d5e3d8',
     '7d7c6c8c5ebaeff879f61f37083a3854184f6c41',
     'b99fec102eb24bffd53ab61fc30d59e810f116a2',
     '7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68',
     'f0c97052e567948adf03e641301e9983c478ccff',
     '7fb724242e2b62b85ca64190c31dcae5303e19b3',
     '4f9709e64a9134fe8aefb36fd827b84d8b617ab5',
     '7350628ccf194c2c3afba4ac588c33e3f3ac778d',
     '0bb892d9391aa706dc2c3b1906567df43cbe06a2',
     '49d4c0ce1a16601f1e265d446b6c5ea6b512f27c',
     '6b5cc594ac466351450f7f64a0b79fdaf4435ad3',
     '3046e5d1f70297e2a507b98224b6222c9688d610',
     '1572607d456d7f633bc6065a2b3048496d679a31',
 ]

 _expected_new_directories_first_visit = [
     'daabc65ec75d487b1335ffc101c0ac11c803f8fc',
     '263be23b4a8101d3ad0d9831319a3e0f2b065f36',
     '7f6e63ba6eb3e2236f65892cd822041f1a01dd5c',
     '4db0a3ecbc976083e2dac01a62f93729698429a3',
     'dfef1c80e1098dd5deda664bb44a9ab1f738af13',
     'eca971d346ea54d95a6e19d5051f900237fafdaa',
     '3aebc29ed1fccc4a6f2f2010fb8e57882406b528',
 ]

 _expected_new_revisions_first_visit = {
     '44183488c0774ce3c957fa19ba695cf18a4a42b3':
     '3aebc29ed1fccc4a6f2f2010fb8e57882406b528'
 }

 _expected_branches_first_visit = {
     'HEAD': {
         'target_type': 'alias',
         'target': 'releases/0.1.0',
     },
     'releases/0.1.0': {
         'target_type': 'revision',
         'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3',
     },
 }

 # hash is different than before as we changed the snapshot
 # gnu used to use `release/` (singular) instead of plural
 _expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5'  # noqa


 def test_visit_with_no_artifact_found(swh_config, requests_mock_datadir):
     url = URL
     unknown_artifact_url = 'https://ftp.g.o/unknown/8sync-0.1.0.tar.gz'
     loader = ArchiveLoader(url, artifacts=[
         {
             'time': 944729610,
             'url': unknown_artifact_url,  # unknown artifact
             'length': 221837,
             'filename': '8sync-0.1.0.tar.gz',
             'version': '0.1.0',
         }
     ])

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'uneventful'
     assert actual_load_status['snapshot_id'] is not None
     stats = get_stats(loader.storage)

     assert {
         'content': 0,
         'directory': 0,
         'origin': 1,
         'origin_visit': 1,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats

-    origin_visit = next(loader.storage.origin_visit_get(url))
+    origin_visit = loader.storage.origin_visit_get_latest(url)
     assert origin_visit['status'] == 'partial'
     assert origin_visit['type'] == 'tar'


 def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
     loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     expected_revision_id = hash_to_bytes(
         '44183488c0774ce3c957fa19ba695cf18a4a42b3')
     revision = list(loader.storage.revision_get([expected_revision_id]))[0]
     assert revision is not None

     check_metadata_paths(revision['metadata'], paths=[
         ('intrinsic', dict),
         ('extrinsic.provider', str),
         ('extrinsic.when', str),
         ('extrinsic.raw', dict),
         ('original_artifact', list),
     ])

     for original_artifact in revision['metadata']['original_artifact']:
         check_metadata_paths(original_artifact, paths=[
             ('filename', str),
             ('length', int),
             ('checksums', dict),
         ])


 def test_visit_with_release_artifact_no_prior_visit(
         swh_config, requests_mock_datadir):
     """With no prior visit, loading a gnu project ends up with 1 snapshot

     """
     loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] == _expected_new_snapshot_first_visit_id  # noqa

     stats = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
     assert list(loader.storage.content_missing_per_sha1(expected_contents)) \
         == []

     expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
     assert list(loader.storage.directory_missing(expected_dirs)) == []

     expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
     assert list(loader.storage.revision_missing(expected_revs)) == []

     expected_snapshot = {
         'id': _expected_new_snapshot_first_visit_id,
         'branches': _expected_branches_first_visit,
     }

     check_snapshot(expected_snapshot, loader.storage)


 def test_2_visits_without_change(swh_config, requests_mock_datadir):
     """With no change between visits, the second visit ends up with
        the same snapshot

     """
     url = URL
     loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

-    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+    origin_visit = loader.storage.origin_visit_get_latest(url)
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'tar'

     actual_load_status2 = loader.load()
     assert actual_load_status2['status'] == 'uneventful'
     assert actual_load_status2['snapshot_id'] is not None
     assert actual_load_status['snapshot_id'] == actual_load_status2[
         'snapshot_id']

-    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+    origin_visit2 = loader.storage.origin_visit_get_latest(url)
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'tar'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1


 def test_2_visits_with_new_artifact(swh_config, requests_mock_datadir):
     """A second visit with a new artifact ends up with a new snapshot

     """
     url = URL
     artifact1 = GNU_ARTIFACTS[0]
     loader = ArchiveLoader(url, [artifact1])

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

-    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+    origin_visit = loader.storage.origin_visit_get_latest(url)
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'tar'

     stats = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1

     artifact2 = {
         'time': 1480991830,
         'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
         'length': 238466,
         'filename': '8sync-0.2.0.tar.gz',
         'version': '0.2.0',
     }

     loader2 = ArchiveLoader(url, [artifact1, artifact2])
     # implementation detail: share the storage in between visits
     loader2.storage = loader.storage
     stats2 = get_stats(loader2.storage)
     assert stats == stats2  # ensure we share the storage

     actual_load_status2 = loader2.load()
     assert actual_load_status2['status'] == 'eventful'
     assert actual_load_status2['snapshot_id'] is not None

     stats2 = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit) + 14,
         'directory': len(_expected_new_directories_first_visit) + 8,
         'origin': 1,
         'origin_visit': 1 + 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit) + 1,
         'skipped_content': 0,
         'snapshot': 1 + 1,
     } == stats2

-    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+    origin_visit2 = loader.storage.origin_visit_get_latest(url)
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'tar'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     # 1 artifact (2nd time no modification) + 1 new artifact
     assert len(urls) == 2


 def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir):
     """Loading a project archive (not gnu) twice without change ends up
        with 1 snapshot

     """
     url = 'https://something.else.org/8sync/'
     artifacts = [  # this is not a gnu artifact
         {
             'time': '1999-12-09T09:53:30+00:00',  # it's also not a timestamp
             'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4',  # noqa
             # keep a gnu artifact reference to avoid adding other test files
             'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
             'length': 238466,
             'filename': '8sync-0.2.0.tar.gz',
             'version': '0.2.0',
         }
     ]

     # Here the loader defines the id_keys to use for existence in the snapshot
     # It's not the default set used by the archive loader
     loader = ArchiveLoader(
         url, artifacts=artifacts, identity_artifact_keys=[
             'sha256', 'length', 'url'])

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

-    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+    origin_visit = loader.storage.origin_visit_get_latest(url)
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'tar'

     actual_load_status2 = loader.load()
     assert actual_load_status2['status'] == 'uneventful'
     assert actual_load_status2['snapshot_id'] == actual_load_status[
         'snapshot_id']

-    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+    origin_visit2 = loader.storage.origin_visit_get_latest(url)
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'tar'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1
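The recurring hunk in this patch swaps iteration over `origin_visit_get` for a direct `origin_visit_get_latest` call. A minimal sketch of the before/after pattern, assuming a `storage` object exposing the swh.storage origin-visit API:

# before: materialize all visits just to pick the newest one
origin_visit = list(storage.origin_visit_get(url))[-1]

# after: let the storage backend return the latest visit directly;
# it yields a dict such as {'status': 'full', 'type': 'tar', ...}
origin_visit = storage.origin_visit_get_latest(url)

The same substitution covers the `next(...)` variant used by tests that only had a single visit, since the latest visit is also the first one there.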
diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py
index eae2c4a..730a3a3 100644
--- a/swh/loader/package/cran/tests/test_cran.py
+++ b/swh/loader/package/cran/tests/test_cran.py
@@ -1,330 +1,330 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import pytest

 from datetime import datetime, timezone
 from dateutil.tz import tzlocal
 from os import path

 from swh.loader.package.cran.loader import (
     extract_intrinsic_metadata, CRANLoader, parse_date,
     parse_debian_control
 )
 from swh.core.tarball import uncompress
 from swh.model.model import TimestampWithTimezone
 from swh.loader.package.tests.common import (
     check_snapshot, get_stats
 )


 def test_cran_parse_date():
     data = [
         # parsable, some have debatable results though
         ('2001-June-08',
          datetime(2001, 6, 8, 0, 0, tzinfo=timezone.utc)),
         ('Tue Dec 27 15:06:08 PST 2011',
          datetime(2011, 12, 27, 15, 6, 8, tzinfo=timezone.utc)),
         ('8-14-2013',
          datetime(2013, 8, 14, 0, 0, tzinfo=timezone.utc)),
         ('2011-01',
          datetime(2011, 1, 1, 0, 0, tzinfo=timezone.utc)),
         ('201109',
          datetime(2009, 11, 20, 0, 0, tzinfo=timezone.utc)),
         ('04-12-2014',
          datetime(2014, 4, 12, 0, 0, tzinfo=timezone.utc)),
         ('2018-08-24, 10:40:10',
          datetime(2018, 8, 24, 10, 40, 10, tzinfo=timezone.utc)),
         ('2013-October-16',
          datetime(2013, 10, 16, 0, 0, tzinfo=timezone.utc)),
         ('Aug 23, 2013',
          datetime(2013, 8, 23, 0, 0, tzinfo=timezone.utc)),
         ('27-11-2014',
          datetime(2014, 11, 27, 0, 0, tzinfo=timezone.utc)),
         ('2019-09-26,',
          datetime(2019, 9, 26, 0, 0, tzinfo=timezone.utc)),
         ('9/25/2014',
          datetime(2014, 9, 25, 0, 0, tzinfo=timezone.utc)),
         ('Fri Jun 27 17:23:53 2014',
          datetime(2014, 6, 27, 17, 23, 53, tzinfo=timezone.utc)),
         ('28-04-2014',
          datetime(2014, 4, 28, 0, 0, tzinfo=timezone.utc)),
         ('04-14-2014',
          datetime(2014, 4, 14, 0, 0, tzinfo=timezone.utc)),
         ('2019-05-08 14:17:31 UTC',
          datetime(2019, 5, 8, 14, 17, 31, tzinfo=timezone.utc)),
         ('Wed May 21 13:50:39 CEST 2014',
          datetime(2014, 5, 21, 13, 50, 39, tzinfo=tzlocal())),
         ('2018-04-10 00:01:04 KST',
          datetime(2018, 4, 10, 0, 1, 4, tzinfo=timezone.utc)),
         ('2019-08-25 10:45',
          datetime(2019, 8, 25, 10, 45, tzinfo=timezone.utc)),
         ('March 9, 2015',
          datetime(2015, 3, 9, 0, 0, tzinfo=timezone.utc)),
         ('Aug. 18, 2012',
          datetime(2012, 8, 18, 0, 0, tzinfo=timezone.utc)),
         ('2014-Dec-17',
          datetime(2014, 12, 17, 0, 0, tzinfo=timezone.utc)),
         ('March 01, 2013',
          datetime(2013, 3, 1, 0, 0, tzinfo=timezone.utc)),
         ('2017-04-08.',
          datetime(2017, 4, 8, 0, 0, tzinfo=timezone.utc)),
         ('2014-Apr-22',
          datetime(2014, 4, 22, 0, 0, tzinfo=timezone.utc)),
         ('Mon Jan 12 19:54:04 2015',
          datetime(2015, 1, 12, 19, 54, 4, tzinfo=timezone.utc)),
         ('May 22, 2014',
          datetime(2014, 5, 22, 0, 0, tzinfo=timezone.utc)),
         ('2014-08-12 09:55:10 EDT',
          datetime(2014, 8, 12, 9, 55, 10, tzinfo=timezone.utc)),
         # unparsable
         ('Fabruary 21, 2012', None),
         ('2019-05-28"', None),
         ('2017-03-01 today', None),
         ('2016-11-0110.1093/icesjms/fsw182', None),
         ('2019-07-010', None),
         ('2015-02.23', None),
         ('20013-12-30', None),
         ('2016-08-017', None),
         ('2019-02-07l', None),
         ('2018-05-010', None),
         ('2019-09-27 KST', None),
         ('$Date$', None),
         ('2019-09-27 KST', None),
         ('2019-06-22 $Date$', None),
         ('$Date: 2013-01-18 12:49:03 -0600 (Fri, 18 Jan 2013) $', None),
         ('2015-7-013', None),
         ('2018-05-023', None),
         ("Check NEWS file for changes: news(package='simSummary')", None)
     ]
     for date, expected_date in data:
         actual_tstz = parse_date(date)
         if expected_date is None:
             assert actual_tstz is None, date
         else:
             expected_tstz = TimestampWithTimezone.from_datetime(expected_date)
             assert actual_tstz == expected_tstz, date
 @pytest.mark.fs
 def test_extract_intrinsic_metadata(tmp_path, datadir):
     """Parsing an existing archive's DESCRIPTION file should yield results"""
     uncompressed_archive_path = str(tmp_path)
     # sample url
     # https://cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz  # noqa
     archive_path = path.join(
         datadir, 'https_cran.r-project.org',
         'src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz')
     uncompress(archive_path, dest=uncompressed_archive_path)

     actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path)

     expected_metadata = {
         'Package': 'KernSmooth',
         'Priority': 'recommended',
         'Version': '2.22-6',
         'Date': '2001-June-08',
         'Title': 'Functions for kernel smoothing for Wand & Jones (1995)',
         'Author': 'S original by Matt Wand.\n\tR port by Brian Ripley <ripley@stats.ox.ac.uk>.',  # noqa
         'Maintainer': 'Brian Ripley <ripley@stats.ox.ac.uk>',
         'Description': 'functions for kernel smoothing (and density estimation)\n corresponding to the book: \n Wand, M.P. and Jones, M.C. (1995) "Kernel Smoothing".',  # noqa
         'License': 'Unlimited use and distribution (see LICENCE).',
         'URL': 'http://www.biostat.harvard.edu/~mwand'
     }
     assert actual_metadata == expected_metadata


 @pytest.mark.fs
 def test_extract_intrinsic_metadata_failures(tmp_path):
     """Parsing a nonexistent path/archive/DESCRIPTION yields {}"""
     # nonexistent first level path
     assert extract_intrinsic_metadata('/something-inexistent') == {}
     # nonexistent second level path (as expected by pypi archives)
     assert extract_intrinsic_metadata(tmp_path) == {}
     # nonexistent DESCRIPTION within second level path
     existing_path_no_pkginfo = str(tmp_path / 'something')
     os.mkdir(existing_path_no_pkginfo)
     assert extract_intrinsic_metadata(tmp_path) == {}


 def test_cran_one_visit(swh_config, requests_mock_datadir):
     version = '2.22-6'
     base_url = 'https://cran.r-project.org'
     origin_url = f'{base_url}/Packages/Recommended_KernSmooth/index.html'
     artifact_url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz'  # noqa
     loader = CRANLoader(origin_url, artifacts=[{
         'url': artifact_url,
         'version': version,
     }])

     actual_load_status = loader.load()

     expected_snapshot_id = '920adcccc78aaeedd3cfa4459dd900d8c3431a21'
     assert actual_load_status == {
         'status': 'eventful',
         'snapshot_id': expected_snapshot_id
     }

     expected_snapshot = {
         'id': expected_snapshot_id,
         'branches': {
             'HEAD': {'target': f'releases/{version}', 'target_type': 'alias'},
             f'releases/{version}': {
                 'target': '42bdb16facd5140424359c8ce89a28ecfa1ce603',
                 'target_type': 'revision'
             }
         }
     }
     check_snapshot(expected_snapshot, loader.storage)

-    origin_visit = next(loader.storage.origin_visit_get(origin_url))
+    origin_visit = loader.storage.origin_visit_get_latest(origin_url)
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'cran'

     visit_stats = get_stats(loader.storage)
     assert {
         'content': 33,
         'directory': 7,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,
         'skipped_content': 0,
         'snapshot': 1
     } == visit_stats

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith(base_url)
     ]
     # visited the artifact once during this single visit
     assert len(urls) == 1


 def test_cran_2_visits_same_origin(
         swh_config, requests_mock_datadir):
     """Multiple visits on the same origin, only 1 archive fetch"""
     version = '2.22-6'
     base_url = 'https://cran.r-project.org'
     origin_url = f'{base_url}/Packages/Recommended_KernSmooth/index.html'
     artifact_url = f'{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz'  # noqa
     loader = CRANLoader(origin_url, artifacts=[{
         'url': artifact_url,
         'version': version
     }])

     # first visit
     actual_load_status = loader.load()

     expected_snapshot_id = '920adcccc78aaeedd3cfa4459dd900d8c3431a21'
     assert actual_load_status == {
         'status': 'eventful',
         'snapshot_id': expected_snapshot_id
     }

     expected_snapshot = {
         'id': expected_snapshot_id,
         'branches': {
             'HEAD': {'target': f'releases/{version}', 'target_type': 'alias'},
             f'releases/{version}': {
                 'target': '42bdb16facd5140424359c8ce89a28ecfa1ce603',
                 'target_type': 'revision'
             }
         }
     }
     check_snapshot(expected_snapshot, loader.storage)

-    origin_visit = next(loader.storage.origin_visit_get(origin_url))
+    origin_visit = loader.storage.origin_visit_get_latest(origin_url)
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'cran'

     visit_stats = get_stats(loader.storage)
     assert {
         'content': 33,
         'directory': 7,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,
         'skipped_content': 0,
         'snapshot': 1
     } == visit_stats

     # second visit
     actual_load_status2 = loader.load()

     assert actual_load_status2 == {
         'status': 'uneventful',
         'snapshot_id': expected_snapshot_id
     }

-    origin_visit2 = next(loader.storage.origin_visit_get(origin_url))
+    origin_visit2 = loader.storage.origin_visit_get_latest(origin_url)
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'cran'

     visit_stats2 = get_stats(loader.storage)
     visit_stats['origin_visit'] += 1
     assert visit_stats2 == visit_stats, 'same stats as 1st visit, +1 visit'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith(base_url)
     ]
     assert len(urls) == 1, 'visited the artifact url once (across 2 visits)'


 def test_parse_debian_control(datadir):
     description_file = os.path.join(datadir, 'description', 'acepack')

     actual_metadata = parse_debian_control(description_file)

     assert actual_metadata == {
         'Package': 'acepack',
         'Maintainer': 'Shawn Garbett',
         'Version': '1.4.1',
         'Author': 'Phil Spector, Jerome Friedman, Robert Tibshirani...',
         'Description': 'Two nonparametric methods for multiple regression...',
         'Title': 'ACE & AVAS 4 Selecting Multiple Regression Transformations',
         'License': 'MIT + file LICENSE',
         'Suggests': 'testthat',
         'Packaged': '2016-10-28 15:38:59 UTC; garbetsp',
         'Repository': 'CRAN',
         'Date/Publication': '2016-10-29 00:11:52',
         'NeedsCompilation': 'yes'
     }


 def test_parse_debian_control_unicode_issue(datadir):
     # iso-8859-1 caused failure, now fixed
     description_file = os.path.join(
         datadir, 'description', 'KnownBR'
     )

     actual_metadata = parse_debian_control(description_file)

     assert actual_metadata == {
         'Package': 'KnowBR',
         'Version': '2.0',
         'Title': '''Discriminating Well Surveyed Spatial Units from Exhaustive
         Biodiversity Databases''',
         'Author': 'Cástor Guisande González and Jorge M. Lobo',
         'Maintainer': 'Cástor Guisande González ',
         'Description': 'It uses species accumulation curves and diverse estimators...',
         'License': 'GPL (>= 2)',
         'Encoding': 'latin1',
         'Depends': 'R (>= 3.0), fossil, mgcv, plotrix, sp, vegan',
         'Suggests': 'raster, rgbif',
         'NeedsCompilation': 'no',
         'Packaged': '2019-01-30 13:27:29 UTC; castor',
         'Repository': 'CRAN',
         'Date/Publication': '2019-01-31 20:53:50 UTC'
     }
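CRAN DESCRIPTION files use Debian-control syntax, which is why the helper is called parse_debian_control. A sketch of how such a parser could look using python-debian's Deb822 class; the encoding fallback mirrors what the latin1 test above exercises, but the exact strategy is an assumption:

from debian.deb822 import Deb822


def parse_debian_control_sketch(filepath: str) -> dict:
    """Parse a DESCRIPTION file (Debian control syntax) into a plain dict."""
    with open(filepath, 'rb') as f:
        raw = f.read()
    try:
        text = raw.decode('utf-8')
    except UnicodeDecodeError:
        # latin1-encoded files (the KnowBR case) must not crash the parse
        text = raw.decode('iso-8859-1')
    return dict(Deb822(text.splitlines()))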
diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py
index f4e3307..f48544b 100644
--- a/swh/loader/package/debian/tests/test_debian.py
+++ b/swh/loader/package/debian/tests/test_debian.py
@@ -1,464 +1,464 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import copy
 import logging
 import pytest
 import random

 from os import path

 from swh.loader.package.debian.loader import (
     DebianLoader, download_package, dsc_information, uid_to_person,
     prepare_person, get_package_metadata, extract_package
 )
 from swh.loader.package.tests.common import check_snapshot, get_stats
 from swh.loader.package.debian.loader import resolve_revision_from
 from swh.model.model import Person

 logger = logging.getLogger(__name__)

 PACKAGE_FILES = {
     'name': 'cicero',
     'version': '0.7.2-3',
     'files': {
         'cicero_0.7.2-3.diff.gz': {
             'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce',
             'name': 'cicero_0.7.2-3.diff.gz',
             'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c',  # noqa
             'size': 3964,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-3.diff.gz'  # noqa
         },
         'cicero_0.7.2-3.dsc': {
             'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a',
             'name': 'cicero_0.7.2-3.dsc',
             'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03',  # noqa
             'size': 1864,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-3.dsc'},  # noqa
         'cicero_0.7.2.orig.tar.gz': {
             'md5sum': '4353dede07c5728319ba7f5595a7230a',
             'name': 'cicero_0.7.2.orig.tar.gz',
             'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786',  # noqa
             'size': 96527,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz'  # noqa
         }
     },
 }

 PACKAGE_FILES2 = {
     'name': 'cicero',
     'version': '0.7.2-4',
     'files': {
         'cicero_0.7.2-4.diff.gz': {
             'md5sum': '1e7e6fc4a59d57c98082a3af78145734',
             'name': 'cicero_0.7.2-4.diff.gz',
             'sha256': '2e6fa296ee7005473ff58d0971f4fd325617b445671480e9f2cfb738d5dbcd01',  # noqa
             'size': 4038,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-4.diff.gz'  # noqa
         },
         'cicero_0.7.2-4.dsc': {
             'md5sum': '1a6c8855a73b4282bb31d15518f18cde',
             'name': 'cicero_0.7.2-4.dsc',
             'sha256': '913ee52f7093913420de5cbe95d63cfa817f1a1daf997961149501894e754f8b',  # noqa
             'size': 1881,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-4.dsc'},  # noqa
         'cicero_0.7.2.orig.tar.gz': {
             'md5sum': '4353dede07c5728319ba7f5595a7230a',
             'name': 'cicero_0.7.2.orig.tar.gz',
             'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786',  # noqa
             'size': 96527,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz'  # noqa
         }
     }
 }

 PACKAGE_PER_VERSION = {
     'stretch/contrib/0.7.2-3': PACKAGE_FILES,
 }

 PACKAGES_PER_VERSION = {
     'stretch/contrib/0.7.2-3': PACKAGE_FILES,
     'buster/contrib/0.7.2-4': PACKAGE_FILES2,
 }


 def test_debian_first_visit(
         swh_config, requests_mock_datadir):
     """With no prior visit, loading a debian project ends up with 1 snapshot

     """
     loader = DebianLoader(
         url='deb://Debian/packages/cicero',
         date='2019-10-12T05:58:09.165557+00:00',
         packages=PACKAGE_PER_VERSION)

     actual_load_status = loader.load()
     expected_snapshot_id = '3b6b66e6ee4e7d903a379a882684a2a50480c0b4'
     assert actual_load_status == {
         'status': 'eventful',
         'snapshot_id': expected_snapshot_id
     }

     stats = get_stats(loader.storage)
     assert {
         'content': 42,
         'directory': 2,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,  # all artifacts under 1 revision
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     expected_snapshot = {
         'id': expected_snapshot_id,
         'branches': {
             'releases/stretch/contrib/0.7.2-3': {
                 'target_type': 'revision',
                 'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
             }
         },
     }  # different than the previous loader as no release is done

     check_snapshot(expected_snapshot, loader.storage)


 def test_debian_first_visit_then_another_visit(
         swh_config, requests_mock_datadir):
     """A second visit without any change ends up with the same snapshot

     """
     url = 'deb://Debian/packages/cicero'
     loader = DebianLoader(
         url=url,
         date='2019-10-12T05:58:09.165557+00:00',
         packages=PACKAGE_PER_VERSION)

     actual_load_status = loader.load()

     expected_snapshot_id = '3b6b66e6ee4e7d903a379a882684a2a50480c0b4'
     assert actual_load_status == {
         'status': 'eventful',
         'snapshot_id': expected_snapshot_id
     }

-    origin_visit = next(loader.storage.origin_visit_get(url))
+    origin_visit = loader.storage.origin_visit_get_latest(url)
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'deb'

     stats = get_stats(loader.storage)
     assert {
         'content': 42,
         'directory': 2,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,  # all artifacts under 1 revision
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     expected_snapshot = {
         'id': expected_snapshot_id,
         'branches': {
             'releases/stretch/contrib/0.7.2-3': {
                 'target_type': 'revision',
                 'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
             }
         },
     }  # different than the previous loader as no release is done

     check_snapshot(expected_snapshot, loader.storage)

     # No change in between load
     actual_load_status2 = loader.load()
     assert actual_load_status2['status'] == 'uneventful'

-    origin_visit2 = list(loader.storage.origin_visit_get(url))
-    assert origin_visit2[-1]['status'] == 'full'
-    assert origin_visit2[-1]['type'] == 'deb'
+    origin_visit2 = loader.storage.origin_visit_get_latest(url)
+    assert origin_visit2['status'] == 'full'
+    assert origin_visit2['type'] == 'deb'

     stats2 = get_stats(loader.storage)
     assert {
         'content': 42 + 0,
         'directory': 2 + 0,
         'origin': 1,
         'origin_visit': 1 + 1,  # a new visit occurred
         'person': 1,
         'release': 0,
         'revision': 1,
         'skipped_content': 0,
         'snapshot': 1,  # same snapshot across 2 visits
     } == stats2

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('http://deb.debian.org')
     ]
     # each artifact url was fetched only once across the 2 visits
     assert len(urls) == len(set(urls))


 def test_uid_to_person():
     uid = 'Someone Name <someone@orga.org>'
     actual_person = uid_to_person(uid)

     assert actual_person == {
         'name': 'Someone Name',
         'email': 'someone@orga.org',
         'fullname': uid,
     }


 def test_prepare_person():
     actual_author = prepare_person({
         'name': 'Someone Name',
         'email': 'someone@orga.org',
         'fullname': 'Someone Name <someone@orga.org>',
     })

     assert actual_author == Person(
         name=b'Someone Name',
         email=b'someone@orga.org',
         fullname=b'Someone Name <someone@orga.org>',
     )
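The two tests above fix uid_to_person's contract: split a Debian-style `Name <email>` uid into name/email while keeping the raw uid as fullname. A minimal sketch, assuming well-formed input:

import re


def uid_to_person_sketch(uid: str) -> dict:
    """Split 'Name <email>' into the dict shape asserted above."""
    match = re.match(r'^\s*(?P<name>.*?)\s*<(?P<email>.*)>\s*$', uid)
    if match:
        name, email = match.group('name'), match.group('email')
    else:
        name, email = uid, ''  # no email part; keep everything as the name
    return {'name': name, 'email': email, 'fullname': uid}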
 def test_download_package(datadir, tmpdir, requests_mock_datadir):
     tmpdir = str(tmpdir)  # py3.5 work around (LocalPath issue)
     all_hashes = download_package(PACKAGE_FILES, tmpdir)
     assert all_hashes == {
         'cicero_0.7.2-3.diff.gz': {
             'checksums': {
                 'sha1': '0815282053f21601b0ec4adf7a8fe47eace3c0bc',
                 'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c'  # noqa
             },
             'filename': 'cicero_0.7.2-3.diff.gz',
             'length': 3964},
         'cicero_0.7.2-3.dsc': {
             'checksums': {
                 'sha1': 'abbec4e8efbbc80278236e1dd136831eac08accd',
                 'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03'  # noqa
             },
             'filename': 'cicero_0.7.2-3.dsc',
             'length': 1864},
         'cicero_0.7.2.orig.tar.gz': {
             'checksums': {
                 'sha1': 'a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43',
                 'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786'  # noqa
             },
             'filename': 'cicero_0.7.2.orig.tar.gz',
             'length': 96527
         }
     }


 def test_dsc_information_ok():
     fname = 'cicero_0.7.2-3.dsc'
     dsc_url, dsc_name = dsc_information(PACKAGE_FILES)

     assert dsc_url == PACKAGE_FILES['files'][fname]['uri']
     assert dsc_name == PACKAGE_FILES['files'][fname]['name']


 def test_dsc_information_not_found():
     fname = 'cicero_0.7.2-3.dsc'
     package_files = copy.deepcopy(PACKAGE_FILES)
     package_files['files'].pop(fname)

     dsc_url, dsc_name = dsc_information(package_files)

     assert dsc_url is None
     assert dsc_name is None


 def test_dsc_information_too_many_dsc_entries():
     # craft an extra dsc file
     fname = 'cicero_0.7.2-3.dsc'
     package_files = copy.deepcopy(PACKAGE_FILES)
     data = package_files['files'][fname]
     fname2 = fname.replace('cicero', 'ciceroo')
     package_files['files'][fname2] = data

     with pytest.raises(
             ValueError, match='Package %s_%s references several dsc' % (
                 package_files['name'], package_files['version'])):
         dsc_information(package_files)


 def test_get_package_metadata(requests_mock_datadir, datadir, tmp_path):
     tmp_path = str(tmp_path)  # py3.5 compat.
     package = PACKAGE_FILES

     logger.debug('package: %s', package)

     # download the packages
     all_hashes = download_package(package, tmp_path)

     # Retrieve information from package
     _, dsc_name = dsc_information(package)

     dl_artifacts = [(tmp_path, hashes) for hashes in all_hashes.values()]

     # Extract information from package
     extracted_path = extract_package(dl_artifacts, tmp_path)

     # Retrieve information on package
     dsc_path = path.join(path.dirname(extracted_path), dsc_name)
     actual_package_info = get_package_metadata(
         package, dsc_path, extracted_path)

     logger.debug('actual_package_info: %s', actual_package_info)

     assert actual_package_info == {
         'changelog': {
             'date': '2014-10-19T16:52:35+02:00',
             'history': [
                 ('cicero', '0.7.2-2'),
                 ('cicero', '0.7.2-1'),
                 ('cicero', '0.7-1')
             ],
             'person': {
                 'email': 'sthibault@debian.org',
                 'fullname': 'Samuel Thibault <sthibault@debian.org>',
                 'name': 'Samuel Thibault'
             }
         },
         'maintainers': [
             {
                 'email': 'debian-accessibility@lists.debian.org',
                 'fullname': 'Debian Accessibility Team '
                             '<debian-accessibility@lists.debian.org>',
                 'name': 'Debian Accessibility Team'
             },
             {
                 'email': 'sthibault@debian.org',
                 'fullname': 'Samuel Thibault <sthibault@debian.org>',
                 'name': 'Samuel Thibault'
             }
         ],
         'name': 'cicero',
         'version': '0.7.2-3'
     }


 def test_debian_multiple_packages(swh_config, requests_mock_datadir):
     url = 'deb://Debian/packages/cicero'
     loader = DebianLoader(
         url=url,
         date='2019-10-12T05:58:09.165557+00:00',
         packages=PACKAGES_PER_VERSION)

     actual_load_status = loader.load()
     expected_snapshot_id = 'defc19021187f3727293121fcf6c5c82cb923604'
     assert actual_load_status == {
         'status': 'eventful',
         'snapshot_id': expected_snapshot_id
     }

-    origin_visit = next(loader.storage.origin_visit_get(url))
+    origin_visit = loader.storage.origin_visit_get_latest(url)
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'deb'

     expected_snapshot = {
         'id': expected_snapshot_id,
         'branches': {
             'releases/stretch/contrib/0.7.2-3': {
                 'target_type': 'revision',
                 'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
             },
             'releases/buster/contrib/0.7.2-4': {
                 'target_type': 'revision',
                 'target': '8224139c274c984147ef4b09aa0e462c55a10bd3',
             }
         },
     }

     check_snapshot(expected_snapshot, loader.storage)


 def test_resolve_revision_from_edge_cases():
     """Solving revision with empty data will result in unknown revision

     """
     for package_artifacts in [{}, PACKAGE_FILES]:
         actual_revision = resolve_revision_from(
             package_artifacts, {})
         assert actual_revision is None

     for known_artifacts in [{}, PACKAGE_FILES]:
         actual_revision = resolve_revision_from(
             {}, known_artifacts)
         assert actual_revision is None

     known_package_artifacts = {
         b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07": {
             'extrinsic': {
                 # empty
             },
             # ... removed the unnecessary intermediary data
         }
     }
     assert not resolve_revision_from(known_package_artifacts, PACKAGE_FILES)


 def test_resolve_revision_from_edge_cases_hit_and_miss():
     """Solving revision with inconsistent data will result in unknown revision

     """
     artifact_metadata = PACKAGE_FILES2
     expected_revision_id = b"(\x08\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xff\x85\x85O\xfe\xcf\x07"  # noqa
     known_package_artifacts = {
         expected_revision_id: {
             'extrinsic': {
                 'raw': PACKAGE_FILES,
             },
             # ... removed the unnecessary intermediary data
         }
     }

     actual_revision = resolve_revision_from(
         known_package_artifacts, artifact_metadata
     )

     assert actual_revision is None


 def test_resolve_revision_from():
     """Solving revision with consistent data will solve the revision

     """
     artifact_metadata = PACKAGE_FILES
     expected_revision_id = b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07"  # noqa

     files = artifact_metadata['files']
     # shuffling dict's keys
     keys = list(files.keys())
     random.shuffle(keys)
     package_files = {
         'files': {k: files[k] for k in keys}
     }

     known_package_artifacts = {
         expected_revision_id: {
             'extrinsic': {
                 'raw': package_files,
             },
             # ... removed the unnecessary intermediary data
         }
     }

     actual_revision = resolve_revision_from(
         known_package_artifacts, artifact_metadata
     )

     assert actual_revision == expected_revision_id
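The three resolve_revision_from tests above pin down the lookup: match the incoming artifact metadata against each known revision's extrinsic['raw'] files, with dict-key order irrelevant (hence the shuffling). A sketch of that logic under the metadata shapes shown above (not the loader's exact code):

from typing import Any, Dict, Optional


def resolve_revision_sketch(
        known_package_artifacts: Dict[bytes, Any],
        artifact_metadata: Dict[str, Any]) -> Optional[bytes]:
    """Return the revision id whose recorded files match, else None."""
    target_files = artifact_metadata.get('files')
    if not target_files:
        return None
    for rev_id, known in known_package_artifacts.items():
        known_files = known.get('extrinsic', {}).get('raw', {}).get('files')
        # dict equality is order-insensitive, matching the shuffle test
        if known_files == target_files:
            return rev_id
    return None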
diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py
index 0ec8922..102e1b5 100644
--- a/swh/loader/package/deposit/tests/test_deposit.py
+++ b/swh/loader/package/deposit/tests/test_deposit.py
@@ -1,205 +1,205 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import re

 from swh.model.hashutil import hash_to_bytes
 from swh.loader.package.deposit.loader import DepositLoader
 from swh.loader.package.tests.common import (
     check_snapshot, check_metadata_paths, get_stats
 )
 from swh.core.pytest_plugin import requests_mock_datadir_factory


 def test_deposit_init_ok(swh_config, swh_loader_config):
     url = 'some-url'
     deposit_id = 999
     loader = DepositLoader(url, deposit_id)  # Something that does not exist

     assert loader.url == url
     assert loader.client is not None
     assert loader.client.base_url == swh_loader_config['deposit']['url']


 def test_deposit_loading_unknown_deposit(
         swh_config, requests_mock_datadir):
     """Loading an unknown deposit should fail

     no origin, no visit, no snapshot
     """
     # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
     url = 'some-url'
     unknown_deposit_id = 667
     loader = DepositLoader(url, unknown_deposit_id)  # does not exist

     actual_load_status = loader.load()
     assert actual_load_status == {'status': 'failed'}

     stats = get_stats(loader.storage)

     assert {
         'content': 0,
         'directory': 0,
         'origin': 0,
         'origin_visit': 0,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 0,
     } == stats


 requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[
     'https://deposit.softwareheritage.org/1/private/666/raw/',
 ])


 def test_deposit_loading_failure_to_retrieve_1_artifact(
         swh_config, requests_mock_datadir_missing_one):
     """Deposit with missing artifact ends up with an uneventful/partial visit

     """
     # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
     url = 'some-url-2'
     deposit_id = 666
     loader = DepositLoader(url, deposit_id)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'uneventful'
     assert actual_load_status['snapshot_id'] is not None

     stats = get_stats(loader.storage)
     assert {
         'content': 0,
         'directory': 0,
         'origin': 1,
         'origin_visit': 1,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats

-    origin_visit = next(loader.storage.origin_visit_get(url))
+    origin_visit = loader.storage.origin_visit_get_latest(url)
     assert origin_visit['status'] == 'partial'
     assert origin_visit['type'] == 'deposit'


 def test_revision_metadata_structure(swh_config, requests_mock_datadir):
     # do not care for deposit update query
     requests_mock_datadir.put(re.compile('https'))

     url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
     deposit_id = 666
     loader = DepositLoader(url, deposit_id)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None
     expected_revision_id = hash_to_bytes(
         '637318680351f5d78856d13264faebbd91efe9bb')
     revision = list(loader.storage.revision_get([expected_revision_id]))[0]

     assert revision is not None

     check_metadata_paths(revision['metadata'], paths=[
         ('extrinsic.provider', str),
         ('extrinsic.when', str),
         ('extrinsic.raw', dict),
         ('original_artifact', list),
     ])

     for original_artifact in revision['metadata']['original_artifact']:
         check_metadata_paths(original_artifact, paths=[
             ('filename', str),
             ('length', int),
             ('checksums', dict),
         ])


 def test_deposit_loading_ok(swh_config, requests_mock_datadir):
     requests_mock_datadir.put(re.compile('https'))  # do not care for put

     url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
     deposit_id = 666
     loader = DepositLoader(url, deposit_id)

     actual_load_status = loader.load()
     expected_snapshot_id = 'b2b327b33dc85818bd23c3ccda8b7e675a66ecbd'
     assert actual_load_status == {
         'status': 'eventful',
         'snapshot_id': expected_snapshot_id,
     }

     stats = get_stats(loader.storage)
     assert {
         'content': 303,
         'directory': 12,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats

-    origin_visit = next(loader.storage.origin_visit_get(url))
+    origin_visit = loader.storage.origin_visit_get_latest(url)
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'deposit'

     expected_branches = {
         'HEAD': {
             'target': '637318680351f5d78856d13264faebbd91efe9bb',
             'target_type': 'revision',
         },
     }

     expected_snapshot = {
         'id': expected_snapshot_id,
         'branches': expected_branches,
     }
     check_snapshot(expected_snapshot, storage=loader.storage)

     # check metadata

     tool = {
         "name": "swh-deposit",
         "version": "0.0.1",
         "configuration": {
             "sword_version": "2",
         }
     }

     tool = loader.storage.tool_get(tool)
     assert tool is not None
     assert tool['id'] is not None

     provider = {
         "provider_name": "hal",
         "provider_type": "deposit_client",
         "provider_url": "https://hal-test.archives-ouvertes.fr/",
         "metadata": None,
     }

     provider = loader.storage.metadata_provider_get_by(provider)
     assert provider is not None
     assert provider['id'] is not None

     metadata = list(loader.storage.origin_metadata_get_by(
         url, provider_type='deposit_client'))
     assert metadata is not None
     assert isinstance(metadata, list)
     assert len(metadata) == 1
     metadata0 = metadata[0]

     assert metadata0['provider_id'] == provider['id']
     assert metadata0['provider_type'] == 'deposit_client'
     assert metadata0['tool_id'] == tool['id']
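check_metadata_paths, used by the structure tests in this patch, walks dotted paths into the revision metadata and type-checks each leaf. The real helper lives in swh.loader.package.tests.common; this is only a plausible sketch of its behaviour:

from typing import Any, Dict, List, Tuple


def check_metadata_paths_sketch(
        metadata: Dict[str, Any],
        paths: List[Tuple[str, type]]) -> None:
    """Assert every dotted path exists in metadata with the given type."""
    for path, expected_type in paths:
        value = metadata
        for key in path.split('.'):
            assert isinstance(value, dict), path
            value = value[key]  # KeyError here means the path is missing
        assert isinstance(value, expected_type), path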
diff --git a/swh/loader/package/functional/tests/test_functional.py b/swh/loader/package/functional/tests/test_functional.py
index 35bf10d..408d8af 100644
--- a/swh/loader/package/functional/tests/test_functional.py
+++ b/swh/loader/package/functional/tests/test_functional.py
@@ -1,276 +1,276 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import pytest

 from json.decoder import JSONDecodeError

 from swh.loader.package.functional.loader import (
     FunctionalLoader, retrieve_sources
 )
 from swh.loader.package.tests.common import (
     get_stats, check_snapshot
 )

 sources_url = 'https://nix-community.github.io/nixpkgs-swh/sources.json'


 def test_retrieve_sources(swh_config, requests_mock_datadir):
     j = retrieve_sources(sources_url)
     assert "sources" in j.keys()
     assert len(j["sources"]) == 2


 def test_retrieve_non_existing(swh_config, requests_mock_datadir):
     with pytest.raises(ValueError):
         FunctionalLoader('https://non-existing-url')


 def test_retrieve_non_json(swh_config, requests_mock_datadir):
     with pytest.raises(JSONDecodeError):
         FunctionalLoader('https://example.com/file.txt')


 def test_loader_one_visit(swh_config, requests_mock_datadir):
     loader = FunctionalLoader(sources_url)
     res = loader.load()
     assert res['status'] == 'eventful'

     stats = get_stats(loader.storage)
     assert {
         'content': 1,
         'directory': 3,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,
         'skipped_content': 0,
         'snapshot': 1
     } == stats

-    origin_visit = next(loader.storage.origin_visit_get(sources_url))
+    origin_visit = loader.storage.origin_visit_get_latest(sources_url)
     # The visit is partial because urls pointing to non tarball files
     # are not handled yet
     assert origin_visit['status'] == 'partial'
     assert origin_visit['type'] == 'functional'


 def test_uncompress_failure(swh_config, requests_mock_datadir):
     """Non tarball files are currently not supported and the uncompress
     function fails on such kind of files.

     However, even in this case of failure (because of the url
     https://example.com/file.txt), a snapshot and a visit have to be
     created (with a status partial since not all files are archived).

     """
     loader = FunctionalLoader(sources_url)
     loader_status = loader.load()

     urls = [s['url'][0] for s in loader.sources]
     assert "https://example.com/file.txt" in urls
     assert loader_status['status'] == 'eventful'

-    origin_visit = next(loader.storage.origin_visit_get(sources_url))
+    origin_visit = loader.storage.origin_visit_get_latest(sources_url)
     # The visit is partial because urls pointing to non tarball files
     # are not handled yet
     assert origin_visit['status'] == 'partial'


 def test_loader_incremental(swh_config, requests_mock_datadir):
     """Ensure a second visit does not download artifacts already
     downloaded by the previous visit.

     """
     loader = FunctionalLoader(sources_url)
     load_status = loader.load()

     loader = FunctionalLoader(sources_url)
     loader.load()

     expected_snapshot_id = '0c5881c74283793ebe9a09a105a9381e41380383'
     assert load_status == {
         'status': 'eventful',
         'snapshot_id': expected_snapshot_id
     }
     expected_branches = {
         'evaluation': {
             'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7',
             'target_type': 'revision'
         },
         'https://github.com/owner-1/repository-1/revision-1.tgz': {
             'target': '488ad4e7b8e2511258725063cf43a2b897c503b4',
             'target_type': 'revision'
         },
     }
     expected_snapshot = {
         'id': expected_snapshot_id,
         'branches': expected_branches,
     }
     check_snapshot(expected_snapshot, storage=loader.storage)

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url == ('https://github.com/owner-1/repository-1/revision-1.tgz')
     ]
     # The artifact
     # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only
     # visited one time
     assert len(urls) == 1
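The first two functional tests above fix retrieve_sources' contract: fetch sources.json and decode it, surfacing a bad URL as ValueError and a non-JSON body as JSONDecodeError. A sketch under those assumptions (not necessarily the loader's exact code):

import json

import requests


def retrieve_sources_sketch(url: str) -> dict:
    """Fetch and decode a nixpkgs sources.json manifest."""
    response = requests.get(url, allow_redirects=True)
    if response.status_code != 200:
        # propagated by the FunctionalLoader constructor in the tests
        raise ValueError('Got %d HTTP code on %s' % (response.status_code, url))
    # a non-JSON body raises json.decoder.JSONDecodeError here
    return json.loads(response.content.decode('utf-8'))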
""" loader = FunctionalLoader(sources_url) load_status = loader.load() expected_snapshot_id = '0c5881c74283793ebe9a09a105a9381e41380383' assert load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } expected_branches = { 'evaluation': { 'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7', 'target_type': 'revision' }, 'https://github.com/owner-1/repository-1/revision-1.tgz': { 'target': '488ad4e7b8e2511258725063cf43a2b897c503b4', 'target_type': 'revision' } } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) stats = get_stats(loader.storage) assert { 'content': 1, 'directory': 3, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 1, 'skipped_content': 0, 'snapshot': 1 } == stats loader = FunctionalLoader(sources_url) load_status = loader.load() expected_snapshot_id = 'b0bfa75cbd0cc90aac3b9e95fb0f59c731176d97' assert load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } # This ensures visits are incremental. Indeed, if we request a # second time an url, because of the requests_mock_datadir_visits # fixture, the file has to end with `_visit1`. expected_branches = { 'evaluation': { 'target': '602140776b2ce6c9159bcf52ada73a297c063d5e', 'target_type': 'revision' }, 'https://github.com/owner-1/repository-1/revision-1.tgz': { 'target': '488ad4e7b8e2511258725063cf43a2b897c503b4', 'target_type': 'revision' }, 'https://github.com/owner-2/repository-1/revision-1.tgz': { 'target': '85e0bad74e33e390aaeb74f139853ae3863ee544', 'target_type': 'revision' } } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) stats = get_stats(loader.storage) assert { 'content': 2, 'directory': 5, 'origin': 1, 'origin_visit': 2, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 2 } == stats def test_resolve_revision_from(swh_config, requests_mock_datadir): loader = FunctionalLoader(sources_url) known_artifacts = { 'id1': {'extrinsic': {'raw': {'url': "url1"}}}, 'id2': {'extrinsic': {'raw': {'url': "url2"}}} } metadata = {'url': 'url1'} assert loader.resolve_revision_from(known_artifacts, metadata) == 'id1' metadata = {'url': 'url3'} assert loader.resolve_revision_from(known_artifacts, metadata) == None # noqa def test_evaluation_branch(swh_config, requests_mock_datadir): loader = FunctionalLoader(sources_url) res = loader.load() assert res['status'] == 'eventful' expected_branches = { 'https://github.com/owner-1/repository-1/revision-1.tgz': { 'target': '488ad4e7b8e2511258725063cf43a2b897c503b4', 'target_type': 'revision', }, 'evaluation': { 'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7', 'target_type': 'revision', }, } expected_snapshot = { 'id': '0c5881c74283793ebe9a09a105a9381e41380383', 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) def test_eoferror(swh_config, requests_mock_datadir): """Load a truncated archive which is invalid to make the uncompress function raising the exception EOFError. We then check if a snapshot is created, meaning this error is well managed. 
""" sources = "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json" # noqa loader = FunctionalLoader(sources) loader.load() expected_branches = { 'evaluation': { 'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7', 'target_type': 'revision', }, } expected_snapshot = { 'id': '4257fa2350168c6bfec726a06452ea27a2c0cb33', 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py index 495275e..cc2ae32 100644 --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -1,608 +1,608 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os import pytest from swh.model.hashutil import hash_to_bytes from swh.model.model import Person from swh.loader.package.npm.loader import ( NpmLoader, extract_npm_package_author, artifact_to_revision_id ) from swh.loader.package.tests.common import ( check_snapshot, check_metadata_paths, get_stats ) def test_extract_npm_package_author(datadir): package_metadata_filepath = os.path.join( datadir, 'https_replicate.npmjs.com', 'org_visit1') with open(package_metadata_filepath) as json_file: package_metadata = json.load(json_file) extract_npm_package_author(package_metadata['versions']['0.0.2']) == \ Person( fullname=b'mooz ', name=b'mooz', email=b'stillpedant@gmail.com' ) assert ( extract_npm_package_author(package_metadata['versions']['0.0.3']) == Person( fullname=b'Masafumi Oyamada ', name=b'Masafumi Oyamada', email=b'stillpedant@gmail.com' ) ) package_json = json.loads(''' { "name": "highlightjs-line-numbers.js", "version": "2.7.0", "description": "Highlight.js line numbers plugin.", "main": "src/highlightjs-line-numbers.js", "dependencies": {}, "devDependencies": { "gulp": "^4.0.0", "gulp-rename": "^1.4.0", "gulp-replace": "^0.6.1", "gulp-uglify": "^1.2.0" }, "repository": { "type": "git", "url": "https://github.com/wcoder/highlightjs-line-numbers.js.git" }, "author": "Yauheni Pakala ", "license": "MIT", "bugs": { "url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues" }, "homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/" }''') # noqa assert extract_npm_package_author(package_json) == \ Person( fullname=b'Yauheni Pakala ', name=b'Yauheni Pakala', email=b'evgeniy.pakalo@gmail.com' ) package_json = json.loads(''' { "name": "3-way-diff", "version": "0.0.1", "description": "3-way diffing of JavaScript objects", "main": "index.js", "authors": [ { "name": "Shawn Walsh", "url": "https://github.com/shawnpwalsh" }, { "name": "Markham F Rollins IV", "url": "https://github.com/mrollinsiv" } ], "keywords": [ "3-way diff", "3 way diff", "three-way diff", "three way diff" ], "devDependencies": { "babel-core": "^6.20.0", "babel-preset-es2015": "^6.18.0", "mocha": "^3.0.2" }, "dependencies": { "lodash": "^4.15.0" } }''') assert extract_npm_package_author(package_json) == \ Person( fullname=b'Shawn Walsh', name=b'Shawn Walsh', email=None ) package_json = json.loads(''' { "name": "yfe-ynpm", "version": "1.0.0", "homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm", "repository": { "type": "git", "url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git" }, "author": [ "fengmk2 (https://fengmk2.com)", "xufuzi (https://7993.org)" ], "license": "MIT" }''') assert 
extract_npm_package_author(package_json) == \ Person( fullname=b'fengmk2 (https://fengmk2.com)', name=b'fengmk2', email=b'fengmk2@gmail.com', ) package_json = json.loads(''' { "name": "umi-plugin-whale", "version": "0.0.8", "description": "Internal contract component", "authors": { "name": "xiaohuoni", "email": "448627663@qq.com" }, "repository": "alitajs/whale", "devDependencies": { "np": "^3.0.4", "umi-tools": "*" }, "license": "MIT" }''') assert extract_npm_package_author(package_json) == \ Person( fullname=b'xiaohuoni <448627663@qq.com>', name=b'xiaohuoni', email=b'448627663@qq.com' ) def normalize_hashes(hashes): if isinstance(hashes, str): return hash_to_bytes(hashes) if isinstance(hashes, list): return [hash_to_bytes(x) for x in hashes] return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()} _expected_new_contents_first_visit = normalize_hashes([ '4ce3058e16ab3d7e077f65aabf855c34895bf17c', '858c3ceee84c8311adc808f8cdb30d233ddc9d18', '0fa33b4f5a4e0496da6843a38ff1af8b61541996', '85a410f8ef8eb8920f2c384a9555566ad4a2e21b', '9163ac8025923d5a45aaac482262893955c9b37b', '692cf623b8dd2c5df2c2998fd95ae4ec99882fb4', '18c03aac6d3e910efb20039c15d70ab5e0297101', '41265c42446aac17ca769e67d1704f99e5a1394d', '783ff33f5882813dca9239452c4a7cadd4dba778', 'b029cfb85107aee4590c2434a3329bfcf36f8fa1', '112d1900b4c2e3e9351050d1b542c9744f9793f3', '5439bbc4bd9a996f1a38244e6892b71850bc98fd', 'd83097a2f994b503185adf4e719d154123150159', 'd0939b4898e83090ee55fd9d8a60e312cfadfbaf', 'b3523a26f7147e4af40d9d462adaae6d49eda13e', 'cd065fb435d6fb204a8871bcd623d0d0e673088c', '2854a40855ad839a54f4b08f5cff0cf52fca4399', 'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe', '0f73d56e1cf480bded8a1ecf20ec6fc53c574713', '0d9882b2dfafdce31f4e77fe307d41a44a74cefe', '585fc5caab9ead178a327d3660d35851db713df1', 'e8cd41a48d79101977e3036a87aeb1aac730686f', '5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7', '9c3cc2763bf9e9e37067d3607302c4776502df98', '3649a68410e354c83cd4a38b66bd314de4c8f5c9', 'e96ed0c091de1ebdf587104eaf63400d1974a1fe', '078ca03d2f99e4e6eab16f7b75fbb7afb699c86c', '38de737da99514de6559ff163c988198bc91367a', ]) _expected_new_directories_first_visit = normalize_hashes([ '3370d20d6f96dc1c9e50f083e2134881db110f4f', '42753c0c2ab00c4501b552ac4671c68f3cf5aece', 'd7895533ef5edbcffdea3f057d9fef3a1ef845ce', '80579be563e2ef3e385226fe7a3f079b377f142c', '3b0ddc6a9e58b4b53c222da4e27b280b6cda591c', 'bcad03ce58ac136f26f000990fc9064e559fe1c0', '5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca', 'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd', '584b5b4b6cf7f038095e820b99386a9c232de931', '184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a', 'bb5f4ee143c970367eb409f2e4c1104898048b9d', '1b95491047add1103db0dfdfa84a9735dcb11e88', 'a00c6de13471a2d66e64aca140ddb21ef5521e62', '5ce6c1cd5cda2d546db513aaad8c72a44c7771e2', 'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2', '202fafcd7c0f8230e89d5496ad7f44ab12b807bf', '775cc516543be86c15c1dc172f49c0d4e6e78235', 'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e', ]) _expected_new_revisions_first_visit = normalize_hashes({ 'd8a1c7474d2956ac598a19f0f27d52f7015f117e': '42753c0c2ab00c4501b552ac4671c68f3cf5aece', '5f9eb78af37ffd12949f235e86fac04898f9f72a': '3370d20d6f96dc1c9e50f083e2134881db110f4f', 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a': 'd7895533ef5edbcffdea3f057d9fef3a1ef845ce'} ) def package_url(package): return 'https://www.npmjs.com/package/%s' % package def package_metadata_url(package): return 'https://replicate.npmjs.com/%s/' % package def test_revision_metadata_structure(swh_config, requests_mock_datadir): package = 'org' loader = 
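extract_npm_package_author has to normalize every shape exercised above: a single 'Name <email> (url)' string, a dict, or a list whose first entry wins. A simplified sketch of that normalization; the regex and the fullname handling are assumptions, not the loader's exact code:

import re

from swh.model.model import Person

_AUTHOR_RE = re.compile(
    r'^\s*(?P<name>[^<(]*?)'         # display name
    r'\s*(?:<(?P<email>[^>]+)>)?'    # optional <email>
    r'\s*(?:\([^)]*\))?\s*$')        # optional (url), not captured


def extract_author_sketch(package_json: dict) -> Person:
    author = package_json.get('author') or package_json.get('authors')
    if isinstance(author, list):
        author = author[0] if author else None
    if isinstance(author, dict):
        name, email = author.get('name'), author.get('email')
        fullname = '%s <%s>' % (name, email) if email else (name or '')
    else:
        fullname = (author or '').strip()  # keep the raw string, urls included
        m = _AUTHOR_RE.match(fullname)
        name = (m.group('name') or None) if m else None
        email = m.group('email') if m else None
    return Person(
        fullname=fullname.encode(),
        name=name.encode() if name else None,
        email=email.encode() if email else None)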
 def normalize_hashes(hashes):
     if isinstance(hashes, str):
         return hash_to_bytes(hashes)
     if isinstance(hashes, list):
         return [hash_to_bytes(x) for x in hashes]
     return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()}


 _expected_new_contents_first_visit = normalize_hashes([
     '4ce3058e16ab3d7e077f65aabf855c34895bf17c',
     '858c3ceee84c8311adc808f8cdb30d233ddc9d18',
     '0fa33b4f5a4e0496da6843a38ff1af8b61541996',
     '85a410f8ef8eb8920f2c384a9555566ad4a2e21b',
     '9163ac8025923d5a45aaac482262893955c9b37b',
     '692cf623b8dd2c5df2c2998fd95ae4ec99882fb4',
     '18c03aac6d3e910efb20039c15d70ab5e0297101',
     '41265c42446aac17ca769e67d1704f99e5a1394d',
     '783ff33f5882813dca9239452c4a7cadd4dba778',
     'b029cfb85107aee4590c2434a3329bfcf36f8fa1',
     '112d1900b4c2e3e9351050d1b542c9744f9793f3',
     '5439bbc4bd9a996f1a38244e6892b71850bc98fd',
     'd83097a2f994b503185adf4e719d154123150159',
     'd0939b4898e83090ee55fd9d8a60e312cfadfbaf',
     'b3523a26f7147e4af40d9d462adaae6d49eda13e',
     'cd065fb435d6fb204a8871bcd623d0d0e673088c',
     '2854a40855ad839a54f4b08f5cff0cf52fca4399',
     'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe',
     '0f73d56e1cf480bded8a1ecf20ec6fc53c574713',
     '0d9882b2dfafdce31f4e77fe307d41a44a74cefe',
     '585fc5caab9ead178a327d3660d35851db713df1',
     'e8cd41a48d79101977e3036a87aeb1aac730686f',
     '5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7',
     '9c3cc2763bf9e9e37067d3607302c4776502df98',
     '3649a68410e354c83cd4a38b66bd314de4c8f5c9',
     'e96ed0c091de1ebdf587104eaf63400d1974a1fe',
     '078ca03d2f99e4e6eab16f7b75fbb7afb699c86c',
     '38de737da99514de6559ff163c988198bc91367a',
 ])

 _expected_new_directories_first_visit = normalize_hashes([
     '3370d20d6f96dc1c9e50f083e2134881db110f4f',
     '42753c0c2ab00c4501b552ac4671c68f3cf5aece',
     'd7895533ef5edbcffdea3f057d9fef3a1ef845ce',
     '80579be563e2ef3e385226fe7a3f079b377f142c',
     '3b0ddc6a9e58b4b53c222da4e27b280b6cda591c',
     'bcad03ce58ac136f26f000990fc9064e559fe1c0',
     '5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca',
     'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd',
     '584b5b4b6cf7f038095e820b99386a9c232de931',
     '184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a',
     'bb5f4ee143c970367eb409f2e4c1104898048b9d',
     '1b95491047add1103db0dfdfa84a9735dcb11e88',
     'a00c6de13471a2d66e64aca140ddb21ef5521e62',
     '5ce6c1cd5cda2d546db513aaad8c72a44c7771e2',
     'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2',
     '202fafcd7c0f8230e89d5496ad7f44ab12b807bf',
     '775cc516543be86c15c1dc172f49c0d4e6e78235',
     'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e',
 ])

 _expected_new_revisions_first_visit = normalize_hashes({
     'd8a1c7474d2956ac598a19f0f27d52f7015f117e':
     '42753c0c2ab00c4501b552ac4671c68f3cf5aece',
     '5f9eb78af37ffd12949f235e86fac04898f9f72a':
     '3370d20d6f96dc1c9e50f083e2134881db110f4f',
     'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a':
     'd7895533ef5edbcffdea3f057d9fef3a1ef845ce'}
 )


 def package_url(package):
     return 'https://www.npmjs.com/package/%s' % package


 def package_metadata_url(package):
     return 'https://replicate.npmjs.com/%s/' % package


 def test_revision_metadata_structure(swh_config, requests_mock_datadir):
     package = 'org'
     loader = NpmLoader(package_url(package))

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     expected_revision_id = hash_to_bytes(
         'd8a1c7474d2956ac598a19f0f27d52f7015f117e')
     revision = list(loader.storage.revision_get([expected_revision_id]))[0]
     assert revision is not None

     check_metadata_paths(revision['metadata'], paths=[
         ('intrinsic.tool', str),
         ('intrinsic.raw', dict),
         ('extrinsic.provider', str),
         ('extrinsic.when', str),
         ('extrinsic.raw', dict),
         ('original_artifact', list),
     ])

     for original_artifact in revision['metadata']['original_artifact']:
         check_metadata_paths(original_artifact, paths=[
             ('filename', str),
             ('length', int),
             ('checksums', dict),
         ])


 def test_npm_loader_first_visit(swh_config, requests_mock_datadir):
     package = 'org'
     loader = NpmLoader(package_url(package))

     actual_load_status = loader.load()
     expected_snapshot_id = 'd0587e1195aed5a8800411a008f2f2d627f18e2d'
     assert actual_load_status == {
         'status': 'eventful',
         'snapshot_id': expected_snapshot_id
     }

     stats = get_stats(loader.storage)

     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 2,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1,
     } == stats

     assert len(list(loader.storage.content_get(
         _expected_new_contents_first_visit))) == len(
             _expected_new_contents_first_visit)

     assert list(loader.storage.directory_missing(
         _expected_new_directories_first_visit)) == []

     assert list(loader.storage.revision_missing(
         _expected_new_revisions_first_visit)) == []

     expected_snapshot = {
         'id': expected_snapshot_id,
         'branches': {
             'HEAD': {
                 'target': 'releases/0.0.4',
                 'target_type': 'alias'
             },
             'releases/0.0.2': {
                 'target': 'd8a1c7474d2956ac598a19f0f27d52f7015f117e',
                 'target_type': 'revision'
             },
             'releases/0.0.3': {
                 'target': '5f9eb78af37ffd12949f235e86fac04898f9f72a',
                 'target_type': 'revision'
             },
             'releases/0.0.4': {
                 'target': 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a',
                 'target_type': 'revision'
             }
         }
     }
     check_snapshot(expected_snapshot, loader.storage)
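artifact_to_revision_id, exercised by the three test_npm_artifact_to_revision_id_* tests further below, resolves an incoming tarball to an already-loaded revision via its dist shasum, coping with both the old (package_source) and current (original_artifact) metadata layouts. A sketch consistent with those tests:

from typing import Any, Dict, Optional


def artifact_to_revision_id_sketch(
        known_artifacts: Dict[bytes, Any],
        artifact_metadata: Dict[str, Any]) -> Optional[bytes]:
    """Map a dist shasum onto a known revision id, else None."""
    sha1 = artifact_metadata['dist']['shasum']
    for rev_id, metadata in known_artifacts.items():
        if not metadata:
            continue
        if 'package_source' in metadata:  # old loader metadata layout
            if metadata['package_source'].get('sha1') == sha1:
                return rev_id
        for artifact in metadata.get('original_artifact', []):  # current one
            if artifact.get('checksums', {}).get('sha1') == sha1:
                return rev_id
    return None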
+ 14, 'directory': len(_expected_new_directories_first_visit) + 15, 'origin': 1, 'origin_visit': 2, 'person': 2, 'release': 0, 'revision': len(_expected_new_revisions_first_visit) + 3, 'skipped_content': 0, 'snapshot': 2, } == stats urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith('https://registry.npmjs.org') ] assert len(urls) == len(set(urls)) # we visited each artifact once across the 2 visits @pytest.mark.usefixtures('requests_mock_datadir') def test_npm_loader_version_divergence(swh_config): package = '@aller_shared' url = package_url(package) loader = NpmLoader(url) actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' assert actual_load_status['snapshot_id'] is not None - origin_visit = list(loader.storage.origin_visit_get(url))[-1] + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'npm' stats = get_stats(loader.storage) assert { # 1 new release artifact 'content': 534, 'directory': 153, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1, } == stats expected_snapshot = { 'id': 'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92', 'branches': { 'HEAD': { 'target_type': 'alias', 'target': 'releases/0.1.0' }, 'releases/0.1.0': { 'target_type': 'revision', 'target': '845673bfe8cbd31b1eaf757745a964137e6f9116', }, 'releases/0.1.1-alpha.14': { 'target_type': 'revision', 'target': '05181c12cd8c22035dd31155656826b85745da37', }, }, } check_snapshot(expected_snapshot, loader.storage) def test_npm_artifact_to_revision_id_none(): """Current loader version should stop early if nothing can be found """ artifact_metadata = { 'dist': { 'shasum': '05181c12cd8c22035dd31155656826b85745da37', }, } known_artifacts = { 'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92': {}, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None def test_npm_artifact_to_revision_id_old_loader_version(): """Current loader version should resolve the old metadata scheme """ artifact_metadata = { 'dist': { 'shasum': '05181c12cd8c22035dd31155656826b85745da37', } } known_artifacts = { hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): { 'package_source': { 'sha1': "something-wrong" } }, hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): { 'package_source': { 'sha1': '05181c12cd8c22035dd31155656826b85745da37', } } } assert artifact_to_revision_id(known_artifacts, artifact_metadata) \ == hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116') def test_npm_artifact_to_revision_id_current_loader_version(): """Current loader version should be able to resolve the current metadata scheme """ artifact_metadata = { 'dist': { 'shasum': '05181c12cd8c22035dd31155656826b85745da37', } } known_artifacts = { hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): { 'original_artifact': [{ 'checksums': { 'sha1': "05181c12cd8c22035dd31155656826b85745da37" }, }], }, hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): { 'original_artifact': [{ 'checksums': { 'sha1': 'something-wrong' }, }], }, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) \ == hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92')
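# Illustrative sketch only (an assumption, not the loader's actual code): the
# three tests above pin down a lookup that maps a new artifact back to an
# already loaded revision by its sha1, accepting both the old
# ('package_source') and the current ('original_artifact') revision metadata
# schemes:
def _npm_artifact_to_revision_id_sketch(known_artifacts, artifact_metadata):
    sha1 = artifact_metadata['dist']['shasum']
    for rev_id, metadata in known_artifacts.items():
        # old scheme: a single 'package_source' mapping with a 'sha1' key
        if metadata.get('package_source', {}).get('sha1') == sha1:
            return rev_id
        # current scheme: a list of original artifacts with checksum dicts
        for artifact in metadata.get('original_artifact', []):
            if artifact.get('checksums', {}).get('sha1') == sha1:
                return rev_id
    return None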
def test_npm_artifact_with_no_intrinsic_metadata( swh_config, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion """ package = 'nativescript-telerik-analytics' url = package_url(package) loader = NpmLoader(url) actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' # no branch as the one artifact has no intrinsic metadata expected_snapshot = { 'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e', 'branches': {}, } check_snapshot(expected_snapshot, loader.storage) - origin_visit = list(loader.storage.origin_visit_get(url))[-1] + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'npm' def test_npm_artifact_with_no_upload_time(swh_config, requests_mock_datadir): """With no upload time, the artifact is skipped """ package = 'jammit-no-time' url = package_url(package) loader = NpmLoader(url) actual_load_status = loader.load() assert actual_load_status['status'] == 'uneventful' # no branch as the one artifact has no upload time expected_snapshot = { 'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e', 'branches': {}, } check_snapshot(expected_snapshot, loader.storage) - origin_visit = list(loader.storage.origin_visit_get(url))[-1] + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'npm' def test_npm_artifact_use_mtime_if_no_time(swh_config, requests_mock_datadir): """With no upload time, the artifact mtime is used instead """ package = 'jammit-express' url = package_url(package) loader = NpmLoader(url) actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' # artifact is used expected_snapshot = { 'id': 'd6e08e19159f77983242877c373c75222d5ae9dd', 'branches': { 'HEAD': { 'target_type': 'alias', 'target': 'releases/0.0.1' }, 'releases/0.0.1': { 'target_type': 'revision', 'target': '9e4dd2b40d1b46b70917c0949aa2195c823a648e', } } } check_snapshot(expected_snapshot, loader.storage) - origin_visit = list(loader.storage.origin_visit_get(url))[-1] + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'npm' def test_npm_no_artifact(swh_config, requests_mock_datadir): """If no artifact at all is found for the origin, the visit fails completely """ package = 'catify' url = package_url(package) loader = NpmLoader(url) actual_load_status = loader.load() assert actual_load_status == { 'status': 'failed', } origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'npm' diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py index 6979813..9c38c39 100644 --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -1,834 +1,834 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from os import path import pytest from unittest.mock import patch from swh.core.tarball import uncompress from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.model.hashutil import hash_to_bytes from swh.model.model import Person from swh.loader.package.pypi.loader import ( PyPILoader, pypi_api_url, author, extract_intrinsic_metadata, artifact_to_revision_id ) from swh.loader.package.tests.common import ( check_snapshot, check_metadata_paths, get_stats ) def test_author_basic(): data = { 'author': "i-am-groot", 'author_email': 'iam@groot.org', } actual_author = author(data) expected_author = Person( fullname=b'i-am-groot <iam@groot.org>', name=b'i-am-groot',
email=b'iam@groot.org', ) assert actual_author == expected_author def test_author_empty_email(): data = { 'author': 'i-am-groot', 'author_email': '', } actual_author = author(data) expected_author = Person( fullname=b'i-am-groot', name=b'i-am-groot', email=b'', ) assert actual_author == expected_author def test_author_empty_name(): data = { 'author': "", 'author_email': 'iam@groot.org', } actual_author = author(data) expected_author = Person( fullname=b' <iam@groot.org>', name=b'', email=b'iam@groot.org', ) assert actual_author == expected_author def test_author_malformed(): data = { 'author': "['pierre', 'paul', 'jacques']", 'author_email': None, } actual_author = author(data) expected_author = Person( fullname=b"['pierre', 'paul', 'jacques']", name=b"['pierre', 'paul', 'jacques']", email=None, ) assert actual_author == expected_author def test_author_malformed_2(): data = { 'author': '[marie, jeanne]', 'author_email': '[marie@some, jeanne@thing]', } actual_author = author(data) expected_author = Person( fullname=b'[marie, jeanne] <[marie@some, jeanne@thing]>', name=b'[marie, jeanne]', email=b'[marie@some, jeanne@thing]', ) assert actual_author == expected_author def test_author_malformed_3(): data = { 'author': '[marie, jeanne, pierre]', 'author_email': '[marie@somewhere.org, jeanne@somewhere.org]', } actual_author = author(data) expected_author = Person( fullname=( b'[marie, jeanne, pierre] ' b'<[marie@somewhere.org, jeanne@somewhere.org]>' ), name=b'[marie, jeanne, pierre]', email=b'[marie@somewhere.org, jeanne@somewhere.org]', ) assert actual_author == expected_author
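# Illustrative sketch only (an assumption, not the loader's code): taken
# together, the author tests above pin down a single composition rule for
# Person.fullname, namely 'name <email>' when an email is present and just
# the bare name otherwise:
def _fullname_sketch(name, email):
    if email:  # an empty or missing email yields no angle-bracket part
        return '{} <{}>'.format(name, email).encode('utf-8')
    return name.encode('utf-8')

assert _fullname_sketch('i-am-groot', 'iam@groot.org') \
    == b'i-am-groot <iam@groot.org>'
assert _fullname_sketch('i-am-groot', '') == b'i-am-groot'
assert _fullname_sketch('', 'iam@groot.org') == b' <iam@groot.org>'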
# configuration error # def test_badly_configured_loader_raise(monkeypatch): """Badly configured loader should raise""" monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False) with pytest.raises(ValueError) as e: PyPILoader(url='some-url') assert 'Misconfiguration' in e.value.args[0] def test_pypi_api_url(): """Computing the pypi api url from the pypi project url should be ok""" url = pypi_api_url('https://pypi.org/project/requests') assert url == 'https://pypi.org/pypi/requests/json' def test_pypi_api_url_with_slash(): """Computing the pypi api url from the pypi project url should be ok""" url = pypi_api_url('https://pypi.org/project/requests/') assert url == 'https://pypi.org/pypi/requests/json' @pytest.mark.fs def test_extract_intrinsic_metadata(tmp_path, datadir): """Parsing an existing archive's PKG-INFO should yield results""" uncompressed_archive_path = str(tmp_path) archive_path = path.join( datadir, 'https_files.pythonhosted.org', '0805nexter-1.1.0.zip') uncompress(archive_path, dest=uncompressed_archive_path) actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path) expected_metadata = { 'metadata_version': '1.0', 'name': '0805nexter', 'version': '1.1.0', 'summary': 'a simple printer of nested lest', 'home_page': 'http://www.hp.com', 'author': 'hgtkpython', 'author_email': '2868989685@qq.com', 'platforms': ['UNKNOWN'], } assert actual_metadata == expected_metadata @pytest.mark.fs def test_extract_intrinsic_metadata_failures(tmp_path): """Parsing a nonexistent path/archive/PKG-INFO yields an empty dict""" tmp_path = str(tmp_path) # py3.5 work around (PosixPath issue) # nonexistent first level path assert extract_intrinsic_metadata('/something-inexistent') == {} # nonexistent second level path (as expected by pypi archives) assert extract_intrinsic_metadata(tmp_path) == {} # nonexistent PKG-INFO within second level path existing_path_no_pkginfo = path.join(tmp_path, 'something') os.mkdir(existing_path_no_pkginfo) assert extract_intrinsic_metadata(tmp_path) == {} # LOADER SCENARIO # # "edge" cases (for the same origin) # # no release artifact: # {visit partial, status: uneventful, no contents, etc...} requests_mock_datadir_missing_all = requests_mock_datadir_factory(ignore_urls=[ 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa 'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip', # noqa ]) def test_no_release_artifact(swh_config, requests_mock_datadir_missing_all): """Loading a pypi project with all artifacts missing ends up with an empty snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status['status'] == 'uneventful' assert actual_load_status['snapshot_id'] is not None stats = get_stats(loader.storage) assert { 'content': 0, 'directory': 0, 'origin': 1, 'origin_visit': 1, 'person': 0, 'release': 0, 'revision': 0, 'skipped_content': 0, 'snapshot': 1, } == stats - origin_visit = next(loader.storage.origin_visit_get(url)) + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'pypi' # problem during loading: # {visit: partial, status: failed, no snapshot} def test_release_with_traceback(swh_config, requests_mock_datadir): url = 'https://pypi.org/project/0805nexter' with patch('swh.loader.package.pypi.loader.PyPILoader.last_snapshot', side_effect=ValueError('Fake problem to fail the visit')): loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status == {'status': 'failed'} stats = get_stats(loader.storage) assert { 'content': 0, 'directory': 0, 'origin': 1, 'origin_visit': 1, 'person': 0, 'release': 0, 'revision': 0, 'skipped_content': 0, 'snapshot': 0, } == stats - origin_visit = next(loader.storage.origin_visit_get(url)) + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'pypi' # problem during loading: failure early enough that only some objects # (contents, directories, etc.)
have been written to storage # {visit: partial, status: eventful, no snapshot} # problem during loading: failure late enough we can have snapshots (some # revisions are written in storage already) # {visit: partial, status: eventful, snapshot} # "normal" cases (for the same origin) # requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[ 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa ]) # some missing release artifacts: # {visit partial, status: eventful, 1 snapshot} def test_revision_metadata_structure(swh_config, requests_mock_datadir): url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' assert actual_load_status['snapshot_id'] is not None expected_revision_id = hash_to_bytes( 'e445da4da22b31bfebb6ffc4383dbf839a074d21') revision = list(loader.storage.revision_get([expected_revision_id]))[0] assert revision is not None check_metadata_paths(revision['metadata'], paths=[ ('intrinsic.tool', str), ('intrinsic.raw', dict), ('extrinsic.provider', str), ('extrinsic.when', str), ('extrinsic.raw', dict), ('original_artifact', list), ]) for original_artifact in revision['metadata']['original_artifact']: check_metadata_paths(original_artifact, paths=[ ('filename', str), ('length', int), ('checksums', dict), ]) def test_visit_with_missing_artifact( swh_config, requests_mock_datadir_missing_one): """Loading a pypi project with some missing artifacts ends up with 1 snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() expected_snapshot_id = 'dd0e4201a232b1c104433741dbf45895b8ac9355' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } stats = get_stats(loader.storage) assert { 'content': 3, 'directory': 2, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 1, 'skipped_content': 0, 'snapshot': 1 } == stats expected_contents = map(hash_to_bytes, [ '405859113963cb7a797642b45f171d6360425d16', 'e5686aa568fdb1d19d7f1329267082fe40482d31', '83ecf6ec1114fd260ca7a833a2d165e71258c338', ]) assert list(loader.storage.content_missing_per_sha1(expected_contents))\ == [] expected_dirs = map(hash_to_bytes, [ 'b178b66bd22383d5f16f4f5c923d39ca798861b4', 'c3a58f8b57433a4b56caaa5033ae2e0931405338', ]) assert list(loader.storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa } assert list(loader.storage.revision_missing(expected_revs)) == [] expected_branches = { 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.2.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) - origin_visit = next(loader.storage.origin_visit_get(url)) + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'pypi'
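# Note (assumption about the fixture): requests_mock_datadir_missing_one
# serves the pypi metadata normally but leaves the 1.1.0 tarball URL
# unanswered, so the loader ingests only the 1.2.0 release and the test
# above ends with a 'partial' visit.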
def test_visit_with_1_release_artifact(swh_config, requests_mock_datadir): """With no prior visit, loading a pypi project ends up with 1 snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() expected_snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } stats = get_stats(loader.storage) assert { 'content': 6, 'directory': 4, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1 } == stats expected_contents = map(hash_to_bytes, [ 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', '938c33483285fd8ad57f15497f538320df82aeb8', 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', '405859113963cb7a797642b45f171d6360425d16', 'e5686aa568fdb1d19d7f1329267082fe40482d31', '83ecf6ec1114fd260ca7a833a2d165e71258c338', ]) assert list(loader.storage.content_missing_per_sha1(expected_contents))\ == [] expected_dirs = map(hash_to_bytes, [ '05219ba38bc542d4345d5638af1ed56c7d43ca7d', 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', 'b178b66bd22383d5f16f4f5c923d39ca798861b4', 'c3a58f8b57433a4b56caaa5033ae2e0931405338', ]) assert list(loader.storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa } assert list(loader.storage.revision_missing(expected_revs)) == [] expected_branches = { 'releases/1.1.0': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.2.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) - origin_visit = next(loader.storage.origin_visit_get(url)) + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'pypi' def test_multiple_visits_with_no_change(swh_config, requests_mock_datadir): """Multiple visits with no changes result in the same snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': snapshot_id, } stats = get_stats(loader.storage) assert { 'content': 6, 'directory': 4, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1 } == stats expected_branches = { 'releases/1.1.0': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.2.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) - origin_visit = next(loader.storage.origin_visit_get(url)) + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'pypi' actual_load_status2 = loader.load() assert actual_load_status2 == { 'status': 'uneventful', 'snapshot_id': actual_load_status2['snapshot_id'] } stats2 = get_stats(loader.storage) expected_stats2 = stats.copy() expected_stats2['origin_visit'] = 1 + 1 assert expected_stats2 == stats2 # same snapshot actual_snapshot_id =
origin_visit['snapshot'] assert actual_snapshot_id == hash_to_bytes(snapshot_id) def test_incremental_visit(swh_config, requests_mock_datadir_visits): """With a prior visit, a 2nd load results in a different snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) visit1_actual_load_status = loader.load() visit1_stats = get_stats(loader.storage) expected_snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a' assert visit1_actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } - origin_visit1 = next(loader.storage.origin_visit_get(url)) + origin_visit1 = loader.storage.origin_visit_get_latest(url) assert origin_visit1['status'] == 'full' assert origin_visit1['type'] == 'pypi' assert { 'content': 6, 'directory': 4, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1 } == visit1_stats # Reset internal state loader._info = None visit2_actual_load_status = loader.load() visit2_stats = get_stats(loader.storage) assert visit2_actual_load_status['status'] == 'eventful' expected_snapshot_id2 = '2e5149a7b0725d18231a37b342e9b7c4e121f283' assert visit2_actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id2 } visits = list(loader.storage.origin_visit_get(url)) assert len(visits) == 2 assert visits[1]['status'] == 'full' assert visits[1]['type'] == 'pypi' assert { 'content': 6 + 1, # 1 more content 'directory': 4 + 2, # 2 more directories 'origin': 1, 'origin_visit': 1 + 1, 'person': 1, 'release': 0, 'revision': 2 + 1, # 1 more revision 'skipped_content': 0, 'snapshot': 1 + 1, # 1 more snapshot } == visit2_stats expected_contents = map(hash_to_bytes, [ 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', '938c33483285fd8ad57f15497f538320df82aeb8', 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', '405859113963cb7a797642b45f171d6360425d16', 'e5686aa568fdb1d19d7f1329267082fe40482d31', '83ecf6ec1114fd260ca7a833a2d165e71258c338', '92689fa2b7fb4d4fc6fb195bf73a50c87c030639' ]) assert list(loader.storage.content_missing_per_sha1(expected_contents))\ == [] expected_dirs = map(hash_to_bytes, [ '05219ba38bc542d4345d5638af1ed56c7d43ca7d', 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', 'b178b66bd22383d5f16f4f5c923d39ca798861b4', 'c3a58f8b57433a4b56caaa5033ae2e0931405338', 'e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a', '52604d46843b898f5a43208045d09fcf8731631b', ]) assert list(loader.storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa hash_to_bytes('51247143b01445c9348afa9edfae31bf7c5d86b1'): hash_to_bytes('e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a'), # noqa } assert list(loader.storage.revision_missing(expected_revs)) == [] expected_branches = { 'releases/1.1.0': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'releases/1.3.0': { 'target': '51247143b01445c9348afa9edfae31bf7c5d86b1', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.3.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': expected_snapshot_id2, 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) - origin_visit =
list(loader.storage.origin_visit_get(url))[-1] + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'pypi' urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith('https://files.pythonhosted.org') ] # visited each artifact once across 2 visits assert len(urls) == len(set(urls)) # release artifact, no new artifact # {visit full, status uneventful, same snapshot as before} # release artifact, old artifact with different checksums # {visit full, status eventful, new snapshot with shared history and some new # different history} # release with multiple sdist artifacts per pypi "version" # snapshot branch output is different def test_visit_1_release_with_2_artifacts(swh_config, requests_mock_datadir): """With no prior visit, loading a pypi release with 2 sdist artifacts ends up with 1 snapshot, one branch per artifact """ url = 'https://pypi.org/project/nexter' loader = PyPILoader(url) actual_load_status = loader.load() expected_snapshot_id = 'a27e638a4dad6fbfa273c6ebec1c4bf320fb84c6' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id, } expected_branches = { 'releases/1.1.0/nexter-1.1.0.zip': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.1.0/nexter-1.1.0.tar.gz': { 'target': '0bf88f5760cca7665d0af4d6575d9301134fe11a', 'target_type': 'revision', }, } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) - origin_visit = next(loader.storage.origin_visit_get(url)) + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'pypi' def test_pypi_artifact_to_revision_id_none(): """Current loader version should stop early if nothing can be found """ artifact_metadata = { 'digests': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa }, } assert artifact_to_revision_id({}, artifact_metadata) is None known_artifacts = { 'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92': { 'original_artifact': { 'sha256': 'something-irrelevant', }, }, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None def test_pypi_artifact_to_revision_id_old_loader_version(): """Current loader version should resolve the old metadata scheme """ artifact_metadata = { 'digests': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa } } known_artifacts = { hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): { 'original_artifact': { 'sha256': "something-wrong", }, }, hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): { 'original_artifact': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa }, } } assert artifact_to_revision_id(known_artifacts, artifact_metadata) \ == hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116') def test_pypi_artifact_to_revision_id_current_loader_version(): """Current loader version should be able to resolve the current metadata scheme """ artifact_metadata = { 'digests': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa } } known_artifacts = { hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): { 'original_artifact': [{ 'checksums': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa }, }], }, hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): { 'original_artifact': [{ 'checksums': { 'sha256': 'something-wrong' }, }], }, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) \ == hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92')
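# Note: unlike the npm lookup sketched earlier, the pypi resolution keys off
# artifact_metadata['digests']['sha256'], and (as the tests above suggest)
# the old metadata scheme stored the digests directly under
# 'original_artifact' as a dict, while the current scheme stores a list of
# artifacts each carrying a 'checksums' dict.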
def test_pypi_artifact_to_revision_id_failures(): with pytest.raises(KeyError, match='sha256'): artifact_metadata = { 'digests': {}, } assert artifact_to_revision_id({}, artifact_metadata) with pytest.raises(KeyError, match='digests'): artifact_metadata = { 'something': 'wrong', } assert artifact_to_revision_id({}, artifact_metadata) def test_pypi_artifact_with_no_intrinsic_metadata( swh_config, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion """ url = 'https://pypi.org/project/upymenu' loader = PyPILoader(url) actual_load_status = loader.load() expected_snapshot_id = '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id, } # no branch as the one artifact has no intrinsic metadata expected_snapshot = { 'id': expected_snapshot_id, 'branches': {} } check_snapshot(expected_snapshot, loader.storage) - origin_visit = next(loader.storage.origin_visit_get(url)) + origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'pypi'