Changeset View
Standalone View
swh/loader/package/nixguix/loader.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import copy | import copy | ||||
import json | import json | ||||
import logging | import logging | ||||
import re | import re | ||||
from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple | from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple | ||||
import attr | import attr | ||||
from swh.loader.package.loader import ( | from swh.loader.package.loader import ( | ||||
BasePackageInfo, | BasePackageInfo, | ||||
PackageLoader, | PackageLoader, | ||||
RawExtrinsicMetadataCore, | RawExtrinsicMetadataCore, | ||||
) | ) | ||||
from swh.loader.package.utils import EMPTY_AUTHOR, api_info | from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.collections import ImmutableDict | from swh.model.collections import ImmutableDict | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
MetadataAuthority, | MetadataAuthority, | ||||
MetadataAuthorityType, | MetadataAuthorityType, | ||||
Revision, | Revision, | ||||
RevisionType, | RevisionType, | ||||
Sha1Git, | Sha1Git, | ||||
Show All 25 Lines | |||||
class NixGuixLoader(PackageLoader[NixGuixPackageInfo]): | class NixGuixLoader(PackageLoader[NixGuixPackageInfo]): | ||||
"""Load sources from a sources.json file. This loader is used to load | """Load sources from a sources.json file. This loader is used to load | ||||
sources used by functional package manager (eg. Nix and Guix). | sources used by functional package manager (eg. Nix and Guix). | ||||
""" | """ | ||||
visit_type = "nixguix" | visit_type = "nixguix" | ||||
def __init__(self, url): | def __init__(self, url): | ||||
super().__init__(url=url) | super().__init__(url=url) | ||||
unsupported_file_extensions = self.config.get("unsupported_file_extensions", []) | |||||
self.raw_sources = retrieve_sources(url) | |||||
clean = clean_sources( | |||||
parse_sources(self.raw_sources), unsupported_file_extensions | |||||
) | |||||
self.sources = clean["sources"] | |||||
self.provider_url = url | self.provider_url = url | ||||
self._integrityByUrl = {s["urls"][0]: s["integrity"] for s in self.sources} | |||||
# The revision used to create the sources.json file. For Nix, | |||||
# this revision belongs to the github.com/nixos/nixpkgs | |||||
# repository | |||||
self.revision = clean["revision"] | |||||
# Note: this could be renamed get_artifacts in the PackageLoader | # Note: this could be renamed get_artifacts in the PackageLoader | ||||
# base class. | # base class. | ||||
def get_versions(self): | @cached_method | ||||
def raw_sources(self): | |||||
return retrieve_sources(self.url) | |||||
@cached_method | |||||
def supported_sources(self): | |||||
raw_sources = self.raw_sources() | |||||
unsupported_file_extensions = self.config.get("unsupported_file_extensions", []) | |||||
return clean_sources(parse_sources(raw_sources), unsupported_file_extensions) | |||||
@cached_method | |||||
def integrity_by_url(self) -> Dict[str, Any]: | |||||
sources = self.supported_sources() | |||||
return {s["urls"][0]: s["integrity"] for s in sources["sources"]} | |||||
def get_versions(self) -> List[str]: | |||||
"""The first mirror of the mirror list is used as branch name in the | """The first mirror of the mirror list is used as branch name in the | ||||
vlorentz: why this change? | |||||
Done Inline Actions (I'll take the question on the overall block change.) (The change is explained in the diff description, but here it is in more detail.) First, a side effect in the constructor (an HTTP interaction here) is not a good idea: it complicates maintenance, readability, etc. Second, in the current state of affairs, if any failure happens, nothing is recorded in the archive. With this change, the HTTP interaction is delayed until after the constructor, so any such failure is now recorded in the archive. Third, it is now consistent with how the npm and PyPI loaders do it. ardumont: (I'll take the question on the overall block change)
(The said change is explained in the diff… | |||||
snapshot. | snapshot. | ||||
""" | """ | ||||
return self._integrityByUrl.keys() | return list(self.integrity_by_url().keys()) | ||||
def get_metadata_authority(self): | def get_metadata_authority(self): | ||||
return MetadataAuthority( | return MetadataAuthority( | ||||
type=MetadataAuthorityType.FORGE, url=self.url, metadata={}, | type=MetadataAuthorityType.FORGE, url=self.url, metadata={}, | ||||
) | ) | ||||
def get_extrinsic_snapshot_metadata(self): | def get_extrinsic_snapshot_metadata(self): | ||||
return [ | return [ | ||||
RawExtrinsicMetadataCore( | RawExtrinsicMetadataCore( | ||||
format="nixguix-sources-json", metadata=self.raw_sources, | format="nixguix-sources-json", metadata=self.raw_sources(), | ||||
), | ), | ||||
] | ] | ||||
# Note: this could be renamed get_artifact_info in the PackageLoader | # Note: this could be renamed get_artifact_info in the PackageLoader | ||||
# base class. | # base class. | ||||
def get_package_info(self, url) -> Iterator[Tuple[str, NixGuixPackageInfo]]: | def get_package_info(self, url) -> Iterator[Tuple[str, NixGuixPackageInfo]]: | ||||
# TODO: try all mirrors and not only the first one. A source | # TODO: try all mirrors and not only the first one. A source | ||||
# can be fetched from several urls, called mirrors. We | # can be fetched from several urls, called mirrors. We | ||||
# currently only use the first one, but if the first one | # currently only use the first one, but if the first one | ||||
# fails, we should try the second one and so on. | # fails, we should try the second one and so on. | ||||
integrity = self._integrityByUrl[url] | integrity = self.integrity_by_url()[url] | ||||
p_info = NixGuixPackageInfo.from_metadata({"url": url, "integrity": integrity}) | p_info = NixGuixPackageInfo.from_metadata({"url": url, "integrity": integrity}) | ||||
yield url, p_info | yield url, p_info | ||||
def known_artifacts( | def known_artifacts( | ||||
self, snapshot: Optional[Snapshot] | self, snapshot: Optional[Snapshot] | ||||
) -> Dict[Sha1Git, Optional[ImmutableDict[str, object]]]: | ) -> Dict[Sha1Git, Optional[ImmutableDict[str, object]]]: | ||||
"""Almost same implementation as the default one except it filters out the extra | """Almost same implementation as the default one except it filters out the extra | ||||
"evaluation" branch which does not have the right metadata structure. | "evaluation" branch which does not have the right metadata structure. | ||||
▲ Show 20 Lines • Show All 58 Lines • ▼ Show 20 Lines | def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: | ||||
pointer can target a nonexistent revision for a time. However, | pointer can target a nonexistent revision for a time. However, | ||||
the github and gnu loaders are supposed to load this revision | the github and gnu loaders are supposed to load this revision | ||||
and should create the revision pointed by this branch. | and should create the revision pointed by this branch. | ||||
This branch can be used to identify the snapshot associated to | This branch can be used to identify the snapshot associated to | ||||
a Nix/Guix evaluation. | a Nix/Guix evaluation. | ||||
""" | """ | ||||
# The revision used to create the sources.json file. For Nix, | |||||
# this revision belongs to the github.com/nixos/nixpkgs | |||||
# repository | |||||
revision = self.supported_sources()["revision"] | |||||
return { | return { | ||||
b"evaluation": { | b"evaluation": { | ||||
"target_type": "revision", | "target_type": "revision", | ||||
"target": hashutil.hash_to_bytes(self.revision), | "target": hashutil.hash_to_bytes(revision), | ||||
} | } | ||||
} | } | ||||
def build_revision( | def build_revision( | ||||
self, p_info: NixGuixPackageInfo, uncompressed_path: str, directory: Sha1Git | self, p_info: NixGuixPackageInfo, uncompressed_path: str, directory: Sha1Git | ||||
) -> Optional[Revision]: | ) -> Optional[Revision]: | ||||
return Revision( | return Revision( | ||||
type=RevisionType.TAR, | type=RevisionType.TAR, | ||||
Show All 11 Lines | ) -> Optional[Revision]: | ||||
"when": self.visit_date.isoformat(), | "when": self.visit_date.isoformat(), | ||||
"raw": p_info.raw_info, | "raw": p_info.raw_info, | ||||
}, | }, | ||||
}, | }, | ||||
) | ) | ||||
def retrieve_sources(url: str) -> bytes: | def retrieve_sources(url: str) -> bytes: | ||||
"""Retrieve sources. Potentially raise NotFound error.""" | |||||
return api_info(url, allow_redirects=True) | return api_info(url, allow_redirects=True) | ||||
Done Inline Actions Again, why just ValueError? vlorentz: Again, why just ValueError? | |||||
Done Inline Actions No idea; because that's what is currently raised? What are you implying? Is Exception enough? ardumont: No idea; because that's what is currently raised?
What are you implying? Is Exception enough? | |||||
Done Inline Actions Context: `api_info` raises only ValueError. ardumont: Context: `api_info` raises only ValueError. | |||||
Done Inline Actions Yes, and I did that (following one of my own comments below). ardumont: Yes, and I did that (following one of my own comments below). | |||||
def parse_sources(raw_sources: bytes) -> Dict[str, Any]: | def parse_sources(raw_sources: bytes) -> Dict[str, Any]: | ||||
return json.loads(raw_sources.decode("utf-8")) | return json.loads(raw_sources.decode("utf-8")) | ||||
def make_pattern_unsupported_file_extension(unsupported_file_extensions: List[str],): | def make_pattern_unsupported_file_extension(unsupported_file_extensions: List[str],): | ||||
"""Make a regexp pattern for unsupported file extension out of a list | """Make a regexp pattern for unsupported file extension out of a list | ||||
of unsupported archive extension list. | of unsupported archive extension list. | ||||
▲ Show 20 Lines • Show All 98 Lines • Show Last 20 Lines |
why this change?