# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json
import requests

from typing import Dict, Optional, Any, Mapping

from swh.model import hashutil
from swh.model.model import (
    Sha1Git, Revision, RevisionType
)

from swh.loader.package.utils import EMPTY_AUTHOR
from swh.loader.package.loader import PackageLoader


def retrieve_sources(url: str) -> Dict[str, Any]:
    """Download and decode the JSON document located at ``url``.

    Args:
        url: location of the sources.json file (redirections are followed).

    Returns:
        The decoded JSON document.

    Raises:
        ValueError: the server did not answer with a 200 status code.
        json.JSONDecodeError: the response body is not valid JSON.

    """
    response = requests.get(url, allow_redirects=True)
    if response.status_code != 200:
        # Interpolate explicitly: ValueError does not apply %-formatting
        # to its arguments, it would only store them as a tuple.
        raise ValueError(
            "Got %d HTTP code on %s" % (response.status_code, url))

    return json.loads(response.content.decode('utf-8'))


class NixGuixLoader(PackageLoader):
    """Load sources from a sources.json file. This loader is used to load
    sources used by functional package manager (eg. Nix and Guix).

    """
    visit_type = 'nixguix'

    def __init__(self, url):
        """Fetch the sources.json file at ``url`` and index its sources.

        Args:
            url: location of the sources.json file; it is also recorded
              as the provider url of the created revisions.

        """
        super().__init__(url=url)
        document = retrieve_sources(url)
        self.sources = document['sources']
        self.provider_url = url

        # Integrity of each source, keyed by its first mirror url.
        # NOTE(review): two sources sharing the same first mirror would
        # silently overwrite each other here -- confirm this cannot
        # happen in a sources.json file.
        self._integrity_by_url = {
            source['url'][0]: source['integrity']
            for source in self.sources
        }

        # The revision used to create the sources.json file. For Nix,
        # this revision belongs to the github.com/nixos/nixpkgs
        # repository
        self.revision = document['revision']

    # Note: this could be renamed get_artifacts in the PackageLoader
    # base class.
    def get_versions(self):
        """The first mirror of the mirror list is used as branch name in the
        snapshot.

        """
        return list(self._integrity_by_url)

    # Note: this could be renamed get_artifact_info in the PackageLoader
    # base class.
    def get_package_info(self, url):
        """Yield the branch name and the artifact description of ``url``.

        Args:
            url: first mirror url of a source, as returned by
              :meth:`get_versions`.

        Yields:
            A ``(branch name, package info)`` pair; the ``raw`` entry of
            the package info carries both the url and the integrity of
            the source.

        """
        # TODO: try all mirrors and not only the first one. A source
        # can be fetched from several urls, called mirrors. We
        # currently only use the first one, but if the first one
        # fails, we should try the second one and so on.
        integrity = self._integrity_by_url[url]
        yield url, {'url': url,
                    'raw': {
                        'url': url,
                        'integrity': integrity}}

    def resolve_revision_from(
            self, known_artifacts: Dict, artifact_metadata: Dict) \
            -> Optional[bytes]:
        """Look for an already archived revision with the same integrity.

        Args:
            known_artifacts: mapping from revision id to the metadata of
              the revision, holding the raw artifact description under
              ``['extrinsic']['raw']``.
            artifact_metadata: raw description of the artifact to load;
              its ``integrity`` entry is the comparison key.

        Returns:
            The id of the first known revision whose integrity matches,
            None otherwise.

        """
        integrity = artifact_metadata['integrity']
        for rev_id, known_artifact in known_artifacts.items():
            known_raw = known_artifact['extrinsic']['raw']
            # Revisions archived before the integrity was recorded only
            # carry the url in their raw metadata: they can not match.
            known_integrity = known_raw.get('integrity')
            if known_integrity is None:
                continue
            if known_integrity == integrity:
                return rev_id
        return None

    def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
        """We add a branch to the snapshot called 'evaluation' pointing to the
        revision used to generate the sources.json file. This revision
        is specified in the sources.json file itself. For the nixpkgs
        origin, this revision is coming from the
        github.com/nixos/nixpkgs repository.

        Note this repository is not loaded explicitly. So, this
        pointer can target a nonexistent revision for a time. However,
        the github and gnu loaders are supposed to load this revision
        and should create the revision pointed by this branch.

        This branch can be used to identify the snapshot associated to
        a Nix/Guix evaluation.

        """
        return {
            b'evaluation': {
                'target_type': 'revision',
                'target': hashutil.hash_to_bytes(self.revision)
            }
        }

    def build_revision(self, a_metadata: Dict, uncompressed_path: str,
                       directory: Sha1Git) -> Optional[Revision]:
        """Build the synthetic revision wrapping a loaded artifact.

        Args:
            a_metadata: raw artifact description, stored as is under the
              ``extrinsic.raw`` metadata of the revision.
            uncompressed_path: not used by this loader.
            directory: id of the directory the revision points to.

        Returns:
            A parentless ``tar`` revision with an empty message, no date
            and the empty author as author and committer.

        """
        return Revision(
            type=RevisionType.TAR,
            message=b'',
            author=EMPTY_AUTHOR,
            date=None,
            committer=EMPTY_AUTHOR,
            committer_date=None,
            parents=[],
            directory=directory,
            synthetic=True,
            metadata={
                'extrinsic': {
                    'provider': self.provider_url,
                    'when': self.visit_date.isoformat(),
                    'raw': a_metadata,
                },
            }
        )
"url": [ "https://example.com/file.txt" ] + "url": [ "https://example.com/file.txt" ], + "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" } ], "version": 1, "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" } diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 index 2ad6cc5..2eacdb6 100644 --- a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 +++ b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 @@ -1,18 +1,21 @@ { "sources": [ { "type": "url", - "url": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ] + "url": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ], + "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" }, { "type": "url", - "url": [ "https://github.com/owner-2/repository-1/revision-1.tgz" ] + "url": [ "https://github.com/owner-2/repository-1/revision-1.tgz" ], + "integrity": "sha256-+vRlzTcnhMlynJGGMuAgMnUGdjpSqGabhcQ/SlRplAE=" }, { "type": "url", - "url": [ "https://example.com/file.txt" ] + "url": [ "https://example.com/file.txt" ], + "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" } ], "version": 1, "revision": "602140776b2ce6c9159bcf52ada73a297c063d5e" } diff --git a/swh/loader/package/nixguix/tests/test_functional.py b/swh/loader/package/nixguix/tests/test_functional.py index b7a274d..d8220ce 100644 --- a/swh/loader/package/nixguix/tests/test_functional.py +++ b/swh/loader/package/nixguix/tests/test_functional.py @@ -1,276 +1,279 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from json.decoder 
import pytest

from json.decoder import JSONDecodeError

from swh.loader.package.nixguix.loader import (
    NixGuixLoader, retrieve_sources
)

from swh.loader.package.tests.common import (
    get_stats, check_snapshot
)

sources_url = 'https://nix-community.github.io/nixpkgs-swh/sources.json'


def test_retrieve_sources(swh_config, requests_mock_datadir):
    """The sources.json fixture is fetched and exposes its two sources."""
    j = retrieve_sources(sources_url)
    assert "sources" in j
    assert len(j["sources"]) == 2


def test_retrieve_non_existing(swh_config, requests_mock_datadir):
    """An url which can not be fetched makes the loader creation fail."""
    with pytest.raises(ValueError):
        NixGuixLoader('https://non-existing-url')


def test_retrieve_non_json(swh_config, requests_mock_datadir):
    """An url serving something else than JSON makes the loader creation
    fail.

    """
    with pytest.raises(JSONDecodeError):
        NixGuixLoader('https://example.com/file.txt')


def test_loader_one_visit(swh_config, requests_mock_datadir):
    """A single visit archives the tarball and ends up partial."""
    loader = NixGuixLoader(sources_url)
    res = loader.load()
    assert res['status'] == 'eventful'

    stats = get_stats(loader.storage)
    assert {
        'content': 1,
        'directory': 3,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 1,
        'skipped_content': 0,
        'snapshot': 1
    } == stats

    origin_visit = loader.storage.origin_visit_get_latest(sources_url)
    # The visit is partial because urls pointing to non tarball file
    # are not handled yet
    assert origin_visit['status'] == 'partial'
    assert origin_visit['type'] == 'nixguix'


def test_uncompress_failure(swh_config, requests_mock_datadir):
    """Non tarball files are currently not supported and the uncompress
    function fails on such kind of files.

    However, even in this case of failure (because of the url
    https://example.com/file.txt), a snapshot and a visit have to be
    created (with a status partial since all files are not archived).

    """
    loader = NixGuixLoader(sources_url)
    loader_status = loader.load()

    urls = [s['url'][0] for s in loader.sources]
    assert "https://example.com/file.txt" in urls
    assert loader_status['status'] == 'eventful'

    origin_visit = loader.storage.origin_visit_get_latest(sources_url)
    # The visit is partial because urls pointing to non tarball files
    # are not handled yet
    assert origin_visit['status'] == 'partial'


def test_loader_incremental(swh_config, requests_mock_datadir):
    """Ensure a second visit does not download an artifact already
    downloaded by the previous visit.

    """
    loader = NixGuixLoader(sources_url)
    load_status = loader.load()

    loader.load()
    expected_snapshot_id = '0c5881c74283793ebe9a09a105a9381e41380383'
    assert load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id
    }
    expected_branches = {
        'evaluation': {
            'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7',
            'target_type': 'revision'
        },
        'https://github.com/owner-1/repository-1/revision-1.tgz': {
            'target': '488ad4e7b8e2511258725063cf43a2b897c503b4',
            'target_type': 'revision'
        },
    }
    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': expected_branches,
    }
    check_snapshot(expected_snapshot, storage=loader.storage)

    urls = [
        m.url for m in requests_mock_datadir.request_history
        if m.url == ('https://github.com/owner-1/repository-1/revision-1.tgz')
    ]
    # The artifact
    # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only
    # visited one time
    assert len(urls) == 1


def test_loader_two_visits(swh_config, requests_mock_datadir_visits):
    """To ensure there is only one origin, but two visits, two revisions
    and two snapshots are created.

    The first visit creates a snapshot containing one tarball. The
    second visit creates a snapshot containing the same tarball and
    another tarball.

    """
    loader = NixGuixLoader(sources_url)
    load_status = loader.load()
    expected_snapshot_id = '0c5881c74283793ebe9a09a105a9381e41380383'
    assert load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id
    }

    expected_branches = {
        'evaluation': {
            'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7',
            'target_type': 'revision'
        },
        'https://github.com/owner-1/repository-1/revision-1.tgz': {
            'target': '488ad4e7b8e2511258725063cf43a2b897c503b4',
            'target_type': 'revision'
        }
    }

    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': expected_branches,
    }
    check_snapshot(expected_snapshot, storage=loader.storage)

    stats = get_stats(loader.storage)
    assert {
        'content': 1,
        'directory': 3,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 1,
        'skipped_content': 0,
        'snapshot': 1
    } == stats

    loader = NixGuixLoader(sources_url)
    load_status = loader.load()
    expected_snapshot_id = 'b0bfa75cbd0cc90aac3b9e95fb0f59c731176d97'
    assert load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id
    }

    # This ensures visits are incremental. Indeed, if we request a
    # second time an url, because of the requests_mock_datadir_visits
    # fixture, the file has to end with `_visit1`.
    expected_branches = {
        'evaluation': {
            'target': '602140776b2ce6c9159bcf52ada73a297c063d5e',
            'target_type': 'revision'
        },
        'https://github.com/owner-1/repository-1/revision-1.tgz': {
            'target': '488ad4e7b8e2511258725063cf43a2b897c503b4',
            'target_type': 'revision'
        },
        'https://github.com/owner-2/repository-1/revision-1.tgz': {
            'target': '85e0bad74e33e390aaeb74f139853ae3863ee544',
            'target_type': 'revision'
        }
    }
    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': expected_branches,
    }
    check_snapshot(expected_snapshot, storage=loader.storage)

    stats = get_stats(loader.storage)
    assert {
        'content': 2,
        'directory': 5,
        'origin': 1,
        'origin_visit': 2,
        'person': 1,
        'release': 0,
        'revision': 2,
        'skipped_content': 0,
        'snapshot': 2
    } == stats


def test_resolve_revision_from(swh_config, requests_mock_datadir):
    """Known artifacts are matched on their integrity."""
    loader = NixGuixLoader(sources_url)

    known_artifacts = {
        'id1': {'extrinsic': {'raw': {
            'url': "url1",
            'integrity': 'integrity1'}}},
        'id2': {'extrinsic': {'raw': {
            'url': "url2",
            'integrity': 'integrity2'}}},
    }

    metadata = {'url': 'url1', 'integrity': 'integrity1'}
    assert loader.resolve_revision_from(known_artifacts, metadata) == 'id1'
    metadata = {'url': 'url3', 'integrity': 'integrity3'}
    # Identity check: None is a singleton, no lint waiver needed.
    assert loader.resolve_revision_from(known_artifacts, metadata) is None


def test_evaluation_branch(swh_config, requests_mock_datadir):
    """The snapshot holds an 'evaluation' branch next to the tarball one."""
    loader = NixGuixLoader(sources_url)
    res = loader.load()
    assert res['status'] == 'eventful'

    expected_branches = {
        'https://github.com/owner-1/repository-1/revision-1.tgz': {
            'target': '488ad4e7b8e2511258725063cf43a2b897c503b4',
            'target_type': 'revision',
        },
        'evaluation': {
            'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7',
            'target_type': 'revision',
        },
    }

    expected_snapshot = {
        'id': '0c5881c74283793ebe9a09a105a9381e41380383',
        'branches': expected_branches,
    }

    check_snapshot(expected_snapshot, storage=loader.storage)


def test_eoferror(swh_config, requests_mock_datadir):
    """Load a truncated archive which is invalid to make the uncompress
    function raise the exception EOFError. We then check that a
    snapshot is created, meaning this error is well managed.

    """
    sources = "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json"  # noqa

    loader = NixGuixLoader(sources)
    loader.load()

    expected_branches = {
        'evaluation': {
            'target': 'cc4e04c26672dd74e5fd0fecb78b435fb55368f7',
            'target_type': 'revision',
        },
    }
    expected_snapshot = {
        'id': '4257fa2350168c6bfec726a06452ea27a2c0cb33',
        'branches': expected_branches,
    }

    check_snapshot(expected_snapshot, storage=loader.storage)