diff --git a/swh/loader/package/functional/loader.py b/swh/loader/package/functional/loader.py index 1d861fd..381f190 100644 --- a/swh/loader/package/functional/loader.py +++ b/swh/loader/package/functional/loader.py @@ -1,85 +1,115 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import requests -from typing import Dict, Optional, Any +from typing import Dict, Optional, Any, Mapping + +from swh.model import hashutil from swh.model.model import ( Sha1Git, Revision, RevisionType ) from swh.loader.package.utils import EMPTY_AUTHOR from swh.loader.package.loader import PackageLoader def retrieve_sources(url: str) -> Dict[str, Any]: response = requests.get(url, allow_redirects=True) if response.status_code != 200: raise ValueError("Got %d HTTP code on %s", response.status_code, url) return json.loads(response.content.decode('utf-8')) class FunctionalLoader(PackageLoader): """Load sources from a sources.json file. This loader is used to load sources used by functional package manager (eg. Nix and Guix). """ visit_type = 'functional' def __init__(self, url): super().__init__(url=url) - self.sources = retrieve_sources(url)['sources'] + s = retrieve_sources(url) + self.sources = s['sources'] self.provider_url = url + # The revision used to create the sources.json file. For Nix, + # this revision belongs to the github.com/nixos/nixpkgs + # repository + self.revision = s['revision'] # Note: this could be renamed get_artifacts in the PackageLoader # base class. def get_versions(self): # TODO: try all mirrors and not only the first one. A source # can be fetched from several urls, called mirrors. We # currently only use the first one, but if the first one # fails, we should try the second one and so on. 
return [s['url'][0] for s in self.sources] # Note: this could be renamed get_artifact_info in the PackageLoader # base class. def get_package_info(self, source): # TODO: we need to provide the sha256 of the source also yield source, {'url': source, 'raw': {'url': source}} def resolve_revision_from( self, known_artifacts: Dict, artifact_metadata: Dict) \ -> Optional[bytes]: for rev_id, known_artifact in known_artifacts.items(): known_url = known_artifact['extrinsic']['raw']['url'] if artifact_metadata['url'] == known_url: return rev_id return None + def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: + """We add a branch to the snapshot called 'evaluation' pointing to the + revision used to generate the sources.json file. This revision + is specified in the sources.json file itself. For the nixpkgs + origin, this revision comes from the + github.com/nixos/nixpkgs repository. + + Note this repository is not loaded explicitly. So, this + pointer can target a nonexistent revision for a time. However, + the github and gnu loaders are supposed to load this revision + and should create the revision pointed to by this branch. + + This branch can be used to identify the snapshot associated with + a Nix/Guix evaluation. 
+ + """ + return { + b'evaluation': { + 'target_type': 'revision', + 'target': hashutil.hash_to_bytes(self.revision) + } + } + def build_revision(self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git) -> Optional[Revision]: return Revision( type=RevisionType.TAR, message=b'', author=EMPTY_AUTHOR, date=None, committer=EMPTY_AUTHOR, committer_date=None, parents=[], directory=directory, synthetic=True, metadata={ 'extrinsic': { 'provider': self.provider_url, 'when': self.visit_date.isoformat(), 'raw': a_metadata, }, } ) diff --git a/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json b/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json index 3395d01..2a21697 100644 --- a/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json +++ b/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json @@ -1,13 +1,14 @@ { "sources": [ { "type": "url", "url": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ] }, { "type": "url", "url": [ "https://example.com/file.txt" ] } ], - "version": 1 + "version": 1, + "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" } diff --git a/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 b/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 index 127b7e2..2ad6cc5 100644 --- a/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 +++ b/swh/loader/package/functional/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 @@ -1,17 +1,18 @@ { "sources": [ { "type": "url", "url": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ] }, { "type": "url", "url": [ "https://github.com/owner-2/repository-1/revision-1.tgz" ] }, { "type": "url", "url": [ "https://example.com/file.txt" ] } ], - 
"version": 1 + "version": 1, + "revision": "602140776b2ce6c9159bcf52ada73a297c063d5e" } diff --git a/swh/loader/package/functional/tests/test_functional.py b/swh/loader/package/functional/tests/test_functional.py index 91a19ec..812384f 100644 --- a/swh/loader/package/functional/tests/test_functional.py +++ b/swh/loader/package/functional/tests/test_functional.py @@ -1,216 +1,252 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from json.decoder import JSONDecodeError from swh.loader.package.functional.loader import ( FunctionalLoader, retrieve_sources ) from swh.loader.package.tests.common import ( get_stats, check_snapshot ) sources_url = 'https://nix-community.github.io/nixpkgs-swh/sources.json' def test_retrieve_sources(swh_config, requests_mock_datadir): j = retrieve_sources(sources_url) assert "sources" in j.keys() assert len(j["sources"]) == 2 def test_retrieve_non_existing(swh_config, requests_mock_datadir): with pytest.raises(ValueError): FunctionalLoader('https://non-existing-url') def test_retrieve_non_json(swh_config, requests_mock_datadir): with pytest.raises(JSONDecodeError): FunctionalLoader('https://example.com/file.txt') def test_loader_one_visit(swh_config, requests_mock_datadir): loader = FunctionalLoader(sources_url) res = loader.load() assert res['status'] == 'eventful' stats = get_stats(loader.storage) assert { 'content': 1, 'directory': 3, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 1, 'skipped_content': 0, 'snapshot': 1 } == stats origin_visit = next(loader.storage.origin_visit_get(sources_url)) # The visit is partial because urls pointing to non tarball file # are not handled yet assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'functional' def test_uncompress_failure(swh_config, 
requests_mock_datadir): """Non tarball files are currently not supported and the uncompress function fails on such kind of files. However, even in this case of failure (because of the url https://example.com/file.txt), a snapshot and a visit has to be created (with a status partial since all files are not archived). """ loader = FunctionalLoader(sources_url) loader_status = loader.load() urls = [s['url'][0] for s in loader.sources] assert "https://example.com/file.txt" in urls assert loader_status['status'] == 'eventful' origin_visit = next(loader.storage.origin_visit_get(sources_url)) # The visit is partial because urls pointing to non tarball files # are not handled yet assert origin_visit['status'] == 'partial' def test_loader_incremental(swh_config, requests_mock_datadir): """Ensure a second visit do not download artifact already downloaded by the previous visit. """ loader = FunctionalLoader(sources_url) load_status = loader.load() loader = FunctionalLoader(sources_url) loader.load() - expected_snapshot_id = '2c7f01ef3115f7999a013979fa27bfa12dcb63eb' + expected_snapshot_id = '0c5881c74283793ebe9a09a105a9381e41380383' assert load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } expected_branches = { + 'evaluation': { + 'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7', + 'target_type': 'revision' + }, 'https://github.com/owner-1/repository-1/revision-1.tgz': { 'target': '488ad4e7b8e2511258725063cf43a2b897c503b4', 'target_type': 'revision' }, } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) urls = [ m.url for m in requests_mock_datadir.request_history if m.url == ('https://github.com/owner-1/repository-1/revision-1.tgz') ] # The artifact # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only # visited one time assert len(urls) == 1 def test_loader_two_visits(swh_config, requests_mock_datadir_visits): """To ensure there is only one 
origin, but two visits, two revisions and two snapshots are created. The first visit creates a snapshot containing one tarball. The second visit creates a snapshot containing the same tarball and another tarball. """ loader = FunctionalLoader(sources_url) load_status = loader.load() - expected_snapshot_id = '2c7f01ef3115f7999a013979fa27bfa12dcb63eb' + expected_snapshot_id = '0c5881c74283793ebe9a09a105a9381e41380383' assert load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } expected_branches = { + 'evaluation': { + 'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7', + 'target_type': 'revision' + }, 'https://github.com/owner-1/repository-1/revision-1.tgz': { 'target': '488ad4e7b8e2511258725063cf43a2b897c503b4', 'target_type': 'revision' } } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) stats = get_stats(loader.storage) assert { 'content': 1, 'directory': 3, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 1, 'skipped_content': 0, 'snapshot': 1 } == stats loader = FunctionalLoader(sources_url) load_status = loader.load() - expected_snapshot_id = '9c4fbfd991b35c7de876cd66bcda2967a8f476ac' + expected_snapshot_id = 'b0bfa75cbd0cc90aac3b9e95fb0f59c731176d97' assert load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id } # This ensures visits are incremental. Indeed, if we request a # second time an url, because of the requests_mock_datadir_visits # fixture, the file has to end with `_visit1`. 
expected_branches = { + 'evaluation': { + 'target': '602140776b2ce6c9159bcf52ada73a297c063d5e', + 'target_type': 'revision' + }, 'https://github.com/owner-1/repository-1/revision-1.tgz': { 'target': '488ad4e7b8e2511258725063cf43a2b897c503b4', 'target_type': 'revision' }, 'https://github.com/owner-2/repository-1/revision-1.tgz': { 'target': '85e0bad74e33e390aaeb74f139853ae3863ee544', 'target_type': 'revision' } } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) stats = get_stats(loader.storage) assert { 'content': 2, 'directory': 5, 'origin': 1, 'origin_visit': 2, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 2 } == stats def test_resolve_revision_from(swh_config, requests_mock_datadir): loader = FunctionalLoader(sources_url) known_artifacts = { 'id1': {'extrinsic': {'raw': {'url': "url1"}}}, 'id2': {'extrinsic': {'raw': {'url': "url2"}}} } metadata = {'url': 'url1'} assert loader.resolve_revision_from(known_artifacts, metadata) == 'id1' metadata = {'url': 'url3'} assert loader.resolve_revision_from(known_artifacts, metadata) == None # noqa + + +def test_evaluation_branch(swh_config, requests_mock_datadir): + loader = FunctionalLoader(sources_url) + res = loader.load() + assert res['status'] == 'eventful' + + expected_branches = { + 'https://github.com/owner-1/repository-1/revision-1.tgz': { + 'target': '488ad4e7b8e2511258725063cf43a2b897c503b4', + 'target_type': 'revision', + }, + 'evaluation': { + 'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7', + 'target_type': 'revision', + }, + } + + expected_snapshot = { + 'id': '0c5881c74283793ebe9a09a105a9381e41380383', + 'branches': expected_branches, + } + + check_snapshot(expected_snapshot, storage=loader.storage)