diff --git a/requirements-swh.txt b/requirements-swh.txt
index 836cc6f..c65491f 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
 swh.core >= 0.0.75
-swh.model >= 0.0.60
+swh.model >= 0.3.0
 swh.scheduler
 swh.storage >= 0.0.189
diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
index 9c3ea48..d548a4e 100644
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -1,143 +1,143 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import iso8601
import logging

from os import path
from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple

from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import release_name, artifact_identity
from swh.model.model import (
    Sha1Git,
    Person,
    TimestampWithTimezone,
    Revision,
    RevisionType,
)

logger = logging.getLogger(__name__)
SWH_PERSON = Person(
    name=b"Software Heritage",
    fullname=b"Software Heritage",
    email=b"robot@softwareheritage.org",
)
REVISION_MESSAGE = b"swh-loader-package: synthetic revision message"


class ArchiveLoader(PackageLoader):
    """Load archive origin's artifact files into swh archive"""

    visit_type = "tar"

    def __init__(
        self,
        url: str,
        artifacts: Sequence[Mapping[str, Any]],
        identity_artifact_keys: Optional[Sequence[str]] = None,
    ):
        """Loader constructor.

        For now, this is the lister's task output.

        Args:
            url: Origin url
            artifacts: List of artifact information with keys:

               - **time**: last modification time as either an isoformat date
                 string or a timestamp
               - **url**: the artifact url to retrieve
               - **filename**: the artifact's filename
               - **version**: the artifact's version
               - **length**: the artifact's length

            identity_artifact_keys: Optional list of keys forming the
                "identity" of an artifact

        """
        super().__init__(url=url)
        self.artifacts = artifacts  # assume order is enforced in the lister
        if not identity_artifact_keys:
            # default keys for gnu
            identity_artifact_keys = ["time", "url", "length", "version"]
        self.identity_artifact_keys = identity_artifact_keys

    def get_versions(self) -> Sequence[str]:
        versions = []
        for archive in self.artifacts:
            v = archive.get("version")
            if v:
                versions.append(v)
        return versions

    def get_default_version(self) -> str:
        # It's the most recent, so for this loader, it's the last one
        return self.artifacts[-1]["version"]

    def get_package_info(
        self, version: str
    ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]:
        for a_metadata in self.artifacts:
            url = a_metadata["url"]
            package_version = a_metadata["version"]
            if version == package_version:
                filename = a_metadata.get("filename")
                p_info = {
                    "url": url,
                    "filename": filename if filename else path.split(url)[-1],
                    "raw": a_metadata,
                }
                # FIXME: this code assumes we have only 1 artifact per
                # versioned package
                yield release_name(version), p_info

    def resolve_revision_from(
        self, known_artifacts: Dict, artifact_metadata: Dict
    ) -> Optional[bytes]:
        identity = artifact_identity(
            artifact_metadata, id_keys=self.identity_artifact_keys
        )
        for rev_id, known_artifact in known_artifacts.items():
            logging.debug("known_artifact: %s", known_artifact)
            reference_artifact = known_artifact["extrinsic"]["raw"]
            known_identity = artifact_identity(
                reference_artifact, id_keys=self.identity_artifact_keys
            )
            if identity == known_identity:
                return rev_id
        return None

    def build_revision(
        self, a_metadata: Mapping[str, Any], uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Revision]:
        time = a_metadata["time"]  # assume it's a timestamp
        if isinstance(time, str):  # otherwise, assume it's a parsable date
            time = iso8601.parse_date(time)
        normalized_time = TimestampWithTimezone.from_datetime(time)
        return Revision(
            type=RevisionType.TAR,
            message=REVISION_MESSAGE,
            date=normalized_time,
            author=SWH_PERSON,
            committer=SWH_PERSON,
            committer_date=normalized_time,
-            parents=[],
+            parents=(),
            directory=directory,
            synthetic=True,
            metadata={
                "intrinsic": {},
                "extrinsic": {
                    "provider": self.url,
                    "when": self.visit_date.isoformat(),
                    "raw": a_metadata,
                },
            },
        )
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
index c3c29e0..0ec027c 100644
--- a/swh/loader/package/cran/loader.py
+++ b/swh/loader/package/cran/loader.py
@@ -1,194 +1,194 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import dateutil.parser
import datetime
import os
import logging
import re

from datetime import timezone
from os import path
from typing import Any, Generator, Dict, List, Mapping, Optional, Tuple

from debian.deb822 import Deb822

from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import release_name, artifact_identity
from swh.model.model import (
    Person,
    TimestampWithTimezone,
    Sha1Git,
    Revision,
    RevisionType,
)

logger = logging.getLogger(__name__)

DATE_PATTERN = re.compile(r"^(?P<year>\d{4})-(?P<month>\d{2})$")


class CRANLoader(PackageLoader):
    visit_type = "cran"

    def __init__(self, url: str, artifacts: List[Dict]):
        """Loader constructor.
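        For example, a minimal instantiation (hypothetical values; the
        artifact dicts carry at least the "url" and "version" keys that
        serve as the artifact identity)::

            loader = CRANLoader(
                url="https://cran.r-project.org/package=zprint",
                artifacts=[{
                    "url": "https://cran.r-project.org/src/contrib/zprint_0.0.6.tar.gz",
                    "version": "0.0.6",
                }],
            )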
Args: url: Origin url to retrieve cran artifact(s) from artifacts: List of associated artifact for the origin url """ super().__init__(url=url) # explicit what we consider the artifact identity self.id_keys = ["url", "version"] self.artifacts = artifacts def get_versions(self) -> List[str]: versions = [] for artifact in self.artifacts: versions.append(artifact["version"]) return versions def get_default_version(self) -> str: return self.artifacts[-1]["version"] def get_package_info( self, version: str ) -> Generator[Tuple[str, Dict[str, Any]], None, None]: for a_metadata in self.artifacts: url = a_metadata["url"] package_version = a_metadata["version"] if version == package_version: p_info = { "url": url, "filename": path.basename(url), "raw": a_metadata, } yield release_name(version), p_info def resolve_revision_from( self, known_artifacts: Mapping[bytes, Mapping], artifact_metadata: Mapping[str, Any], ) -> Optional[bytes]: """Given known_artifacts per revision, try to determine the revision for artifact_metadata """ new_identity = artifact_identity(artifact_metadata, self.id_keys) for rev_id, known_artifact_meta in known_artifacts.items(): logging.debug("known_artifact_meta: %s", known_artifact_meta) known_artifact = known_artifact_meta["extrinsic"]["raw"] known_identity = artifact_identity(known_artifact, self.id_keys) if new_identity == known_identity: return rev_id return None def build_revision( self, a_metadata: Mapping[str, Any], uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: # a_metadata is empty metadata = extract_intrinsic_metadata(uncompressed_path) date = parse_date(metadata.get("Date")) author = Person.from_fullname(metadata.get("Maintainer", "").encode()) version = metadata.get("Version", a_metadata["version"]) return Revision( message=version.encode("utf-8"), type=RevisionType.TAR, date=date, author=author, committer=author, committer_date=date, - parents=[], + parents=(), directory=directory, synthetic=True, metadata={ "intrinsic": {"tool": "DESCRIPTION", "raw": metadata,}, "extrinsic": { "provider": self.url, "when": self.visit_date.isoformat(), "raw": a_metadata, }, }, ) def parse_debian_control(filepath: str) -> Dict[str, Any]: """Parse debian control at filepath""" metadata: Dict = {} logger.debug("Debian control file %s", filepath) for paragraph in Deb822.iter_paragraphs(open(filepath, "rb")): logger.debug("paragraph: %s", paragraph) metadata.update(**paragraph) logger.debug("metadata parsed: %s", metadata) return metadata def extract_intrinsic_metadata(dir_path: str) -> Dict[str, Any]: """Given an uncompressed path holding the DESCRIPTION file, returns a DESCRIPTION parsed structure as a dict. Cran origins describes their intrinsic metadata within a DESCRIPTION file at the root tree of a tarball. This DESCRIPTION uses a simple file format called DCF, the Debian control format. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from pypi. 
    Returns:
        the DESCRIPTION parsed structure as a dict (or empty dict if missing)

    """
    # Retrieve the root folder of the archive
    if not os.path.exists(dir_path):
        return {}
    lst = os.listdir(dir_path)
    if len(lst) != 1:
        return {}
    project_dirname = lst[0]
    description_path = os.path.join(dir_path, project_dirname, "DESCRIPTION")
    if not os.path.exists(description_path):
        return {}
    return parse_debian_control(description_path)


def parse_date(date: Optional[str]) -> Optional[TimestampWithTimezone]:
    """Parse a date into a datetime"""
    assert not date or isinstance(date, str)
    dt: Optional[datetime.datetime] = None
    if not date:
        return None
    try:
        specific_date = DATE_PATTERN.match(date)
        if specific_date:
            year = int(specific_date.group("year"))
            month = int(specific_date.group("month"))
            dt = datetime.datetime(year, month, 1)
        else:
            dt = dateutil.parser.parse(date)

        if not dt.tzinfo:
            # up for discussion: the timezone needs to be set, or
            # normalize_timestamp is not happy: ValueError: normalize_timestamp
            # received datetime without timezone: 2001-06-08 00:00:00
            dt = dt.replace(tzinfo=timezone.utc)
    except Exception as e:
        logger.warning("Failed to parse date %s. Reason: %s", date, e)
    if dt:
        return TimestampWithTimezone.from_datetime(dt)
    else:
        return None
diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py
index fc1e2e4..4793c6a 100644
--- a/swh/loader/package/debian/loader.py
+++ b/swh/loader/package/debian/loader.py
@@ -1,418 +1,418 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import email.utils
import logging
from os import path
import re
import subprocess

from dateutil.parser import parse as parse_date
from debian.changelog import Changelog
from debian.deb822 import Dsc
from typing import Any, Generator, List, Mapping, Optional, Sequence, Tuple

from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import download, release_name
from swh.model.model import (
    Sha1Git,
    Person,
    Revision,
    RevisionType,
    TimestampWithTimezone,
)

logger = logging.getLogger(__name__)

UPLOADERS_SPLIT = re.compile(r"(?<=\>)\s*,\s*")


class DebianLoader(PackageLoader):
    """Load debian origins into swh archive."""

    visit_type = "deb"

    def __init__(self, url: str, date: str, packages: Mapping[str, Any]):
        """Debian Loader implementation.

        Args:
            url: Origin url (e.g. deb://Debian/packages/cicero)
            date: Ignored
            packages: versioned packages and associated artifacts, example::

              {
                'stretch/contrib/0.7.2-3': {
                  'name': 'cicero',
                  'version': '0.7.2-3',
                  'files': {
                    'cicero_0.7.2-3.diff.gz': {
                       'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce',
                       'name': 'cicero_0.7.2-3.diff.gz',
                       'sha256': 'f039c9642fe15c75bed5254315e2a29f...',
                       'size': 3964,
                       'uri': 'http://d.d.o/cicero_0.7.2-3.diff.gz',
                    },
                    'cicero_0.7.2-3.dsc': {
                       'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a',
                       'name': 'cicero_0.7.2-3.dsc',
                       'sha256': '35b7f1048010c67adfd8d70e4961aefb...',
                       'size': 1864,
                       'uri': 'http://d.d.o/cicero_0.7.2-3.dsc',
                    },
                    'cicero_0.7.2.orig.tar.gz': {
                       'md5sum': '4353dede07c5728319ba7f5595a7230a',
                       'name': 'cicero_0.7.2.orig.tar.gz',
                       'sha256': '63f40f2436ea9f67b44e2d4bd669dbab...',
                       'size': 96527,
                       'uri': 'http://d.d.o/cicero_0.7.2.orig.tar.gz',
                    }
                  },
                },
                # ...
              }

        """
        super().__init__(url=url)
        self.packages = packages

    def get_versions(self) -> Sequence[str]:
        """Returns the keys of the packages input (e.g.
        stretch/contrib/0.7.2-3, etc...)
""" return list(self.packages.keys()) def get_package_info( self, version: str ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: meta = self.packages[version] p_info = meta.copy() p_info["raw"] = meta yield release_name(version), p_info def resolve_revision_from( self, known_package_artifacts: Mapping, artifact_metadata: Mapping ) -> Optional[bytes]: return resolve_revision_from(known_package_artifacts, artifact_metadata) def download_package( self, p_info: Mapping[str, Any], tmpdir: str ) -> List[Tuple[str, Mapping]]: """Contrary to other package loaders (1 package, 1 artifact), `a_metadata` represents the package's datafiles set to fetch: - .orig.tar.gz - .dsc - .diff.gz This is delegated to the `download_package` function. """ all_hashes = download_package(p_info, tmpdir) logger.debug("all_hashes: %s", all_hashes) res = [] for hashes in all_hashes.values(): res.append((tmpdir, hashes)) logger.debug("res: %s", res) return res def uncompress( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str ) -> str: logger.debug("dl_artifacts: %s", dl_artifacts) return extract_package(dl_artifacts, dest=dest) def build_revision( self, a_metadata: Mapping[str, Any], uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: dsc_url, dsc_name = dsc_information(a_metadata) if not dsc_name: raise ValueError("dsc name for url %s should not be None" % dsc_url) dsc_path = path.join(path.dirname(uncompressed_path), dsc_name) i_metadata = get_package_metadata(a_metadata, dsc_path, uncompressed_path) logger.debug("i_metadata: %s", i_metadata) logger.debug("a_metadata: %s", a_metadata) msg = "Synthetic revision for Debian source package %s version %s" % ( a_metadata["name"], a_metadata["version"], ) date = TimestampWithTimezone.from_iso8601(i_metadata["changelog"]["date"]) author = prepare_person(i_metadata["changelog"]["person"]) # inspired from swh.loader.debian.converters.package_metadata_to_revision # noqa return Revision( type=RevisionType.DSC, message=msg.encode("utf-8"), author=author, date=date, committer=author, committer_date=date, - parents=[], + parents=(), directory=directory, synthetic=True, metadata={ "intrinsic": {"tool": "dsc", "raw": i_metadata,}, "extrinsic": { "provider": dsc_url, "when": self.visit_date.isoformat(), "raw": a_metadata, }, }, ) def resolve_revision_from( known_package_artifacts: Mapping, artifact_metadata: Mapping ) -> Optional[bytes]: """Given known package artifacts (resolved from the snapshot of previous visit) and the new artifact to fetch, try to solve the corresponding revision. """ artifacts_to_fetch = artifact_metadata.get("files") if not artifacts_to_fetch: return None def to_set(data): return frozenset( [ (name, meta["sha256"], meta["size"]) for name, meta in data["files"].items() ] ) # what we want to avoid downloading back if we have them already set_new_artifacts = to_set(artifact_metadata) known_artifacts_revision_id = {} for rev_id, known_artifacts in known_package_artifacts.items(): extrinsic = known_artifacts.get("extrinsic") if not extrinsic: continue s = to_set(extrinsic["raw"]) known_artifacts_revision_id[s] = rev_id return known_artifacts_revision_id.get(set_new_artifacts) def uid_to_person(uid: str) -> Mapping[str, str]: """Convert an uid to a person suitable for insertion. 
    Args:
        uid: an uid of the form "Name <email@address>"

    Returns:
        a dictionary with the following keys:

        - name: the name associated to the uid
        - email: the mail associated to the uid
        - fullname: the actual uid input

    """
    logger.debug("uid: %s", uid)
    ret = {
        "name": "",
        "email": "",
        "fullname": uid,
    }

    name, mail = email.utils.parseaddr(uid)
    if name and mail:
        ret["name"] = name
        ret["email"] = mail
    else:
        ret["name"] = uid
    return ret


def prepare_person(person: Mapping[str, str]) -> Person:
    """Prepare person for swh serialization...

    Args:
        A person dict

    Returns:
        A person ready for storage

    """
    return Person.from_dict(
        {key: value.encode("utf-8") for (key, value) in person.items()}
    )


def download_package(package: Mapping[str, Any], tmpdir: Any) -> Mapping[str, Any]:
    """Fetch a source package in a temporary directory and check the checksums
    for all files.

    Args:
        package: Dict defining the set of files representing a debian package
        tmpdir: Where to download and extract the files to ingest

    Returns:
        Dict of swh hashes per filename key

    """
    all_hashes = {}
    for filename, fileinfo in package["files"].items():
        uri = fileinfo["uri"]
        logger.debug("fileinfo: %s", fileinfo)
        extrinsic_hashes = {"sha256": fileinfo["sha256"]}
        logger.debug("extrinsic_hashes(%s): %s", filename, extrinsic_hashes)
        filepath, hashes = download(
            uri, dest=tmpdir, filename=filename, hashes=extrinsic_hashes
        )
        all_hashes[filename] = hashes

    logger.debug("all_hashes: %s", all_hashes)
    return all_hashes


def dsc_information(package: Mapping[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    """Retrieve dsc information from a package.

    Args:
        package: Package metadata information

    Returns:
        Tuple of dsc file's uri, dsc's full disk path

    """
    dsc_name = None
    dsc_url = None
    for filename, fileinfo in package["files"].items():
        if filename.endswith(".dsc"):
            if dsc_name:
                raise ValueError(
                    "Package %s_%s references several dsc files."
                    % (package["name"], package["version"])
                )
            dsc_url = fileinfo["uri"]
            dsc_name = filename

    return dsc_url, dsc_name


def extract_package(dl_artifacts: List[Tuple[str, Mapping]], dest: str) -> str:
    """Extract a Debian source package to a given directory.

    Note that after extraction the target directory will be the root of the
    extracted package, rather than containing it.

    Args:
        package: package information dictionary
        dest: directory where the package files are stored

    Returns:
        Package extraction directory

    """
    a_path = dl_artifacts[0][0]
    logger.debug("dl_artifacts: %s", dl_artifacts)
    for _, hashes in dl_artifacts:
        logger.debug("hashes: %s", hashes)
        filename = hashes["filename"]
        if filename.endswith(".dsc"):
            dsc_name = filename
            break

    dsc_path = path.join(a_path, dsc_name)
    destdir = path.join(dest, "extracted")
    logfile = path.join(dest, "extract.log")
    logger.debug(
        "extract Debian source package %s in %s" % (dsc_path, destdir),
        extra={"swh_type": "deb_extract", "swh_dsc": dsc_path, "swh_destdir": destdir,},
    )

    cmd = [
        "dpkg-source",
        "--no-copy",
        "--no-check",
        "--ignore-bad-version",
        "-x",
        dsc_path,
        destdir,
    ]

    try:
        with open(logfile, "w") as stdout:
            subprocess.check_call(cmd, stdout=stdout, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logdata = open(logfile, "r").read()
        raise ValueError(
            "dpkg-source exited with code %s: %s" % (e.returncode, logdata)
        ) from None

    return destdir


def get_package_metadata(
    package: Mapping[str, Any], dsc_path: str, extracted_path: str
) -> Mapping[str, Any]:
    """Get the package metadata from the source package at dsc_path,
    extracted in extracted_path.

    Args:
        package: the package dict (with a dsc_path key)
        dsc_path: path to the package's dsc file
        extracted_path: the path where the package got extracted

    Returns:
        dict: a dictionary with the following keys:

        - history: list of (package_name, package_version) tuples parsed from
          the package changelog

    """
    with open(dsc_path, "rb") as dsc:
        parsed_dsc = Dsc(dsc)

    # Parse the changelog to retrieve the rest of the package information
    changelog_path = path.join(extracted_path, "debian/changelog")
    with open(changelog_path, "rb") as changelog:
        try:
            parsed_changelog = Changelog(changelog)
        except UnicodeDecodeError:
            logger.warning(
                "Unknown encoding for changelog %s,"
                " falling back to iso" % changelog_path,
                extra={
                    "swh_type": "deb_changelog_encoding",
                    "swh_name": package["name"],
                    "swh_version": str(package["version"]),
                    "swh_changelog": changelog_path,
                },
            )

            # need to reset as Changelog scrolls to the end of the file
            changelog.seek(0)
            parsed_changelog = Changelog(changelog, encoding="iso-8859-15")

    package_info = {
        "name": package["name"],
        "version": str(package["version"]),
        "changelog": {
            "person": uid_to_person(parsed_changelog.author),
            "date": parse_date(parsed_changelog.date).isoformat(),
            "history": [
                (block.package, str(block.version)) for block in parsed_changelog
            ][1:],
        },
    }

    maintainers = [
        uid_to_person(parsed_dsc["Maintainer"]),
    ]
    maintainers.extend(
        uid_to_person(person)
        for person in UPLOADERS_SPLIT.split(parsed_dsc.get("Uploaders", ""))
    )
    package_info["maintainers"] = maintainers

    return package_info
diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
index 3d65462..22f0f2d 100644
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -1,281 +1,281 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json
import logging
import requests
import types

from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union

from swh.model.hashutil import hash_to_hex, hash_to_bytes
from swh.model.model import (
    Person,
    Revision,
    RevisionType,
    TimestampWithTimezone,
    Sha1Git,
)

from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import download

logger = logging.getLogger(__name__)


class DepositLoader(PackageLoader):
    """Load deposit origin's artifact releases into swh archive.
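    A loading is driven by a deposit identifier; a minimal sketch of an
    invocation (hypothetical url and deposit id, assuming a configured
    deposit client)::

        loader = DepositLoader(url="https://hal.example.org/hal-01243573", deposit_id="42")
        result = loader.load()
        # e.g. {'status': 'eventful', ...}; {'status': 'failed'} when the
        # deposit id is unknown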
""" visit_type = "deposit" def __init__(self, url: str, deposit_id: str): """Constructor Args: url: Origin url to associate the artifacts/metadata to deposit_id: Deposit identity """ super().__init__(url=url) config_deposit = self.config["deposit"] self.deposit_id = deposit_id self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"]) self.metadata: Dict[str, Any] = {} def get_versions(self) -> Sequence[str]: # only 1 branch 'HEAD' with no alias since we only have 1 snapshot # branch return ["HEAD"] def get_package_info( self, version: str ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: p_info = { "filename": "archive.zip", "raw": self.metadata, } yield "HEAD", p_info def download_package( self, p_info: Mapping[str, Any], tmpdir: str ) -> List[Tuple[str, Mapping]]: """Override to allow use of the dedicated deposit client """ return [self.client.archive_get(self.deposit_id, tmpdir, p_info["filename"])] def build_revision( self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: depo = a_metadata.pop("deposit") # Note: # `date` and `committer_date` are always transmitted by the deposit read api # which computes itself the values. The loader needs to use those to create the # revision. # date: codemeta:dateCreated if any, deposit completed_date otherwise date = TimestampWithTimezone.from_dict(depo["author_date"]) # commit_date: codemeta:datePublished if any, deposit completed_date otherwise commit_date = TimestampWithTimezone.from_dict(depo["committer_date"]) client, id, collection = [depo[k] for k in ["client", "id", "collection"]] message = f"{client}: Deposit {id} in collection {collection}".encode("utf-8") author = parse_author(depo["author"]) committer = parse_author(depo["committer"]) return Revision( type=RevisionType.TAR, message=message, author=author, date=date, committer=committer, committer_date=commit_date, - parents=[hash_to_bytes(p) for p in depo["revision_parents"]], + parents=tuple([hash_to_bytes(p) for p in depo["revision_parents"]]), directory=directory, synthetic=True, metadata={ "extrinsic": { "provider": self.client.metadata_url(self.deposit_id), "when": self.visit_date.isoformat(), "raw": a_metadata, }, }, ) def load(self) -> Dict: # First making sure the deposit is known prior to trigger a loading try: self.metadata = self.client.metadata_get(self.deposit_id) except ValueError: logger.error(f"Unknown deposit {self.deposit_id}, ignoring") return {"status": "failed"} # Then usual loading r = super().load() success = r["status"] != "failed" if success: # Update archive with metadata information origin_metadata = self.metadata["origin_metadata"] logger.debug("origin_metadata: %s", origin_metadata) provider = origin_metadata["provider"] authority = { "type": provider["provider_type"], "url": provider["provider_url"], "metadata": { "name": provider["provider_name"], **(provider["metadata"] or {}), }, } self.storage.metadata_authority_add(**authority) tool = origin_metadata["tool"] fetcher = { "name": tool["name"], "version": tool["version"], "metadata": tool["configuration"], } self.storage.metadata_fetcher_add(**fetcher) metadata = origin_metadata["metadata"] format = "sword-v2-atom-codemeta-v2-in-json" self.storage.origin_metadata_add( self.url, self.visit_date, {"type": authority["type"], "url": authority["url"]}, {"name": fetcher["name"], "version": fetcher["version"]}, format, json.dumps(metadata).encode(), ) # Update deposit status try: if not success: self.client.status_update(self.deposit_id, 
status="failed") return r snapshot_id = hash_to_bytes(r["snapshot_id"]) branches = self.storage.snapshot_get(snapshot_id)["branches"] logger.debug("branches: %s", branches) if not branches: return r rev_id = branches[b"HEAD"]["target"] revisions = self.storage.revision_get([rev_id]) # FIXME: inconsistency between tests and production code if isinstance(revisions, types.GeneratorType): revisions = list(revisions) revision = revisions[0] # Retrieve the revision identifier dir_id = revision["directory"] # update the deposit's status to success with its # revision-id and directory-id self.client.status_update( self.deposit_id, status="done", revision_id=hash_to_hex(rev_id), directory_id=hash_to_hex(dir_id), snapshot_id=r["snapshot_id"], origin_url=self.url, ) except Exception: logger.exception("Problem when trying to update the deposit's status") return {"status": "failed"} return r def parse_author(author) -> Person: """See prior fixme """ return Person( fullname=author["fullname"].encode("utf-8"), name=author["name"].encode("utf-8"), email=author["email"].encode("utf-8"), ) class ApiClient: """Private Deposit Api client """ def __init__(self, url, auth: Optional[Mapping[str, str]]): self.base_url = url.rstrip("/") self.auth = None if not auth else (auth["username"], auth["password"]) def do(self, method: str, url: str, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. Args: method (str): supported http methods as in get/post/put Returns: The request's execution output """ method_fn = getattr(requests, method) if self.auth: kwargs["auth"] = self.auth return method_fn(url, *args, **kwargs) def archive_get( self, deposit_id: Union[int, str], tmpdir: str, filename: str ) -> Tuple[str, Dict]: """Retrieve deposit's archive artifact locally """ url = f"{self.base_url}/{deposit_id}/raw/" return download(url, dest=tmpdir, filename=filename, auth=self.auth) def metadata_url(self, deposit_id: Union[int, str]) -> str: return f"{self.base_url}/{deposit_id}/meta/" def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]: """Retrieve deposit's metadata artifact as json """ url = self.metadata_url(deposit_id) r = self.do("get", url) if r.ok: return r.json() msg = f"Problem when retrieving deposit metadata at {url}" logger.error(msg) raise ValueError(msg) def status_update( self, deposit_id: Union[int, str], status: str, revision_id: Optional[str] = None, directory_id: Optional[str] = None, snapshot_id: Optional[str] = None, origin_url: Optional[str] = None, ): """Update deposit's information including status, and persistent identifiers result of the loading. 
""" url = f"{self.base_url}/{deposit_id}/update/" payload = {"status": status} if revision_id: payload["revision_id"] = revision_id if directory_id: payload["directory_id"] = directory_id if snapshot_id: payload["snapshot_id"] = snapshot_id if origin_url: payload["origin_url"] = origin_url self.do("put", url, json=payload) diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py index 2c3e9f4..1e0f484 100644 --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -1,232 +1,232 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import requests from typing import Dict, Optional, Any, Mapping from swh.model import hashutil from swh.model.model import ( Revision, RevisionType, TargetType, Snapshot, BaseModel, Sha1Git, ) from swh.loader.package.utils import EMPTY_AUTHOR from swh.loader.package.loader import PackageLoader logger = logging.getLogger(__name__) class NixGuixLoader(PackageLoader): """Load sources from a sources.json file. This loader is used to load sources used by functional package manager (eg. Nix and Guix). """ visit_type = "nixguix" def __init__(self, url): super().__init__(url=url) raw = retrieve_sources(url) clean = clean_sources(raw) self.sources = clean["sources"] self.provider_url = url self._integrityByUrl = {s["urls"][0]: s["integrity"] for s in self.sources} # The revision used to create the sources.json file. For Nix, # this revision belongs to the github.com/nixos/nixpkgs # repository self.revision = clean["revision"] # Note: this could be renamed get_artifacts in the PackageLoader # base class. def get_versions(self): """The first mirror of the mirror list is used as branch name in the snapshot. """ return self._integrityByUrl.keys() # Note: this could be renamed get_artifact_info in the PackageLoader # base class. def get_package_info(self, url): # TODO: try all mirrors and not only the first one. A source # can be fetched from several urls, called mirrors. We # currently only use the first one, but if the first one # fails, we should try the second one and so on. integrity = self._integrityByUrl[url] yield url, {"url": url, "raw": {"url": url, "integrity": integrity}} def known_artifacts(self, snapshot: Optional[Snapshot]) -> Dict[Sha1Git, BaseModel]: """Almost same implementation as the default one except it filters out the extra "evaluation" branch which does not have the right metadata structure. 
""" if not snapshot: return {} # Skip evaluation revision which has no metadata revs = [ rev.target for branch_name, rev in snapshot.branches.items() if ( rev and rev.target_type == TargetType.REVISION and branch_name != b"evaluation" ) ] known_revisions = self.storage.revision_get(revs) ret = {} for revision in known_revisions: if not revision: # revision_get can return None continue ret[revision["id"]] = revision["metadata"] return ret def resolve_revision_from( self, known_artifacts: Dict, artifact_metadata: Dict ) -> Optional[bytes]: for rev_id, known_artifact in known_artifacts.items(): try: known_integrity = known_artifact["extrinsic"]["raw"]["integrity"] except KeyError as e: logger.exception( "Unexpected metadata revision structure detected: %(context)s", { "context": { "revision": hashutil.hash_to_hex(rev_id), "reason": str(e), "known_artifact": known_artifact, } }, ) # metadata field for the revision is not as expected by the loader # nixguix. We consider this not the right revision and continue checking # the other revisions continue else: if artifact_metadata["integrity"] == known_integrity: return rev_id return None def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: """We add a branch to the snapshot called 'evaluation' pointing to the revision used to generate the sources.json file. This revision is specified in the sources.json file itself. For the nixpkgs origin, this revision is coming from the github.com/nixos/nixpkgs repository. Note this repository is not loaded explicitly. So, this pointer can target a nonexistent revision for a time. However, the github and gnu loaders are supposed to load this revision and should create the revision pointed by this branch. This branch can be used to identify the snapshot associated to a Nix/Guix evaluation. """ return { b"evaluation": { "target_type": "revision", "target": hashutil.hash_to_bytes(self.revision), } } def build_revision( self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: return Revision( type=RevisionType.TAR, message=b"", author=EMPTY_AUTHOR, date=None, committer=EMPTY_AUTHOR, committer_date=None, - parents=[], + parents=(), directory=directory, synthetic=True, metadata={ "extrinsic": { "provider": self.provider_url, "when": self.visit_date.isoformat(), "raw": a_metadata, }, }, ) def retrieve_sources(url: str) -> Dict[str, Any]: response = requests.get(url, allow_redirects=True) if response.status_code != 200: raise ValueError("Got %d HTTP code on %s", response.status_code, url) return json.loads(response.content.decode("utf-8")) def clean_sources(sources: Dict[str, Any]) -> Dict[str, Any]: """Validate and clean the sources structure. First, it ensures all top level keys are presents. Then, it walks on the sources list and removes sources that don't contain required keys. Raises: ValueError: if a top level key is missing """ # Required top level keys required_keys = ["version", "revision", "sources"] missing_keys = [] for required_key in required_keys: if required_key not in sources: missing_keys.append(required_key) if missing_keys != []: raise ValueError( "sources structure invalid, missing: %s", ",".join(missing_keys) ) # Only the version 1 is currently supported if sources["version"] != 1: raise ValueError( "The sources structure version '%d' is not supported", sources["version"] ) # If a source doesn't contain required attributes, this source is # skipped but others could still be archived. 
    verified_sources = []
    for source in sources["sources"]:
        valid = True
        required_keys = ["urls", "integrity", "type"]
        for required_key in required_keys:
            if required_key not in source:
                logger.info(
                    "Skip source '%s' because key '%s' is missing", source, required_key
                )
                valid = False

        # Only inspect the values once all required keys are known to be
        # present; otherwise the lookups below would raise KeyError.
        if valid and source["type"] != "url":
            logger.info(
                "Skip source '%s' because the type %s is not supported",
                source,
                source["type"],
            )
            valid = False

        if valid and not isinstance(source["urls"], list):
            logger.info(
                "Skip source '%s' because the urls attribute is not a list", source
            )
            valid = False

        if valid:
            verified_sources.append(source)

    sources["sources"] = verified_sources
    return sources
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
index a9b6d48..b6eb5e0 100644
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -1,287 +1,287 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json
import logging
import os

from codecs import BOM_UTF8
from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional

import attr
import chardet

from urllib.parse import quote
from swh.model.model import (
    Person,
    RevisionType,
    Revision,
    TimestampWithTimezone,
    Sha1Git,
)

from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import api_info, release_name

logger = logging.getLogger(__name__)


class NpmLoader(PackageLoader):
    """Load npm origin's artifact releases into swh archive."""

    visit_type = "npm"

    def __init__(self, url: str):
        """Constructor

        Args:
            url (str): origin url (e.g. https://www.npmjs.com/package/<package_name>)

        """
        super().__init__(url=url)
        package_name = url.split("https://www.npmjs.com/package/")[1]
        safe_name = quote(package_name, safe="")
        self.provider_url = f"https://replicate.npmjs.com/{safe_name}/"
        self._info: Dict[str, Any] = {}
        self._versions = None

    @property
    def info(self) -> Dict[str, Any]:
        """Return the project metadata information (fetched from npm registry)"""
        if not self._info:
            self._info = api_info(self.provider_url)
        return self._info

    def get_versions(self) -> Sequence[str]:
        return sorted(list(self.info["versions"].keys()))

    def get_default_version(self) -> str:
        return self.info["dist-tags"].get("latest", "")

    def get_package_info(
        self, version: str
    ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]:
        meta = self.info["versions"][version]
        url = meta["dist"]["tarball"]
        p_info = {
            "url": url,
            "filename": os.path.basename(url),
            "raw": meta,
        }
        yield release_name(version), p_info

    def resolve_revision_from(
        self, known_artifacts: Dict, artifact_metadata: Dict
    ) -> Optional[bytes]:
        return artifact_to_revision_id(known_artifacts, artifact_metadata)

    def build_revision(
        self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Revision]:
        i_metadata = extract_intrinsic_metadata(uncompressed_path)
        if not i_metadata:
            return None

        # from intrinsic metadata
        author = extract_npm_package_author(i_metadata)
        message = i_metadata["version"].encode("ascii")

        # from extrinsic metadata
        # No date available in intrinsic metadata: retrieve it from the API
        # metadata, using the version number that the API claims this package
        # has.
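        # For illustration, the relevant slice of the replicate.npmjs.com
        # payload (hypothetical values) looks like:
        #
        #     {"time": {"0.1.0": "2015-01-01T00:00:00.000Z", ...},
        #      "versions": {"0.1.0": {"dist": {"tarball": "..."}, ...}, ...}}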
extrinsic_version = a_metadata["version"] if "time" in self.info: date = self.info["time"][extrinsic_version] elif "mtime" in a_metadata: date = a_metadata["mtime"] else: artifact_name = os.path.basename(a_metadata["dist"]["tarball"]) raise ValueError( "Origin %s: Cannot determine upload time for artifact %s." % (self.url, artifact_name) ) date = TimestampWithTimezone.from_iso8601(date) # FIXME: this is to remain bug-compatible with earlier versions: date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0)) r = Revision( type=RevisionType.TAR, message=message, author=author, date=date, committer=author, committer_date=date, - parents=[], + parents=(), directory=directory, synthetic=True, metadata={ "intrinsic": {"tool": "package.json", "raw": i_metadata,}, "extrinsic": { "provider": self.provider_url, "when": self.visit_date.isoformat(), "raw": a_metadata, }, }, ) return r def artifact_to_revision_id( known_artifacts: Dict, artifact_metadata: Dict ) -> Optional[bytes]: """Given metadata artifact, solves the associated revision id. The following code allows to deal with 2 metadata formats: - old format sample:: { 'package_source': { 'sha1': '05181c12cd8c22035dd31155656826b85745da37', } } - new format sample:: { 'original_artifact': [{ 'checksums': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa ... }, }], ... } """ shasum = artifact_metadata["dist"]["shasum"] for rev_id, known_artifact in known_artifacts.items(): known_original_artifact = known_artifact.get("original_artifact") if not known_original_artifact: # previous loader-npm version kept original artifact elsewhere known_original_artifact = known_artifact.get("package_source") if not known_original_artifact: continue original_hash = known_original_artifact["sha1"] else: assert isinstance(known_original_artifact, list) original_hash = known_original_artifact[0]["checksums"]["sha1"] if shasum == original_hash: return rev_id return None def extract_npm_package_author(package_json) -> Person: """ Extract package author from a ``package.json`` file content and return it in swh format. Args: package_json (dict): Dict holding the content of parsed ``package.json`` file Returns: Person """ def _author_str(author_data): if type(author_data) is dict: author_str = "" if "name" in author_data: author_str += author_data["name"] if "email" in author_data: author_str += " <%s>" % author_data["email"] return author_str elif type(author_data) is list: return _author_str(author_data[0]) if len(author_data) > 0 else "" else: return author_data for author_key in ("author", "authors"): if author_key in package_json: author_str = _author_str(package_json[author_key]) return Person.from_fullname(author_str.encode()) return Person(fullname=b"", name=None, email=None) def _lstrip_bom(s, bom=BOM_UTF8): if s.startswith(bom): return s[len(bom) :] else: return s def load_json(json_bytes): """ Try to load JSON from bytes and return a dictionary. First try to decode from utf-8. If the decoding failed, try to detect the encoding and decode again with replace error handling. If JSON is malformed, an empty dictionary will be returned. 
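    For example::

        load_json(b'{"name": "foo"}')  # -> {'name': 'foo'}
        load_json(b"not json")         # -> {}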
Args: json_bytes (bytes): binary content of a JSON file Returns: dict: JSON data loaded in a dictionary """ json_data = {} try: json_str = _lstrip_bom(json_bytes).decode("utf-8") except UnicodeDecodeError: encoding = chardet.detect(json_bytes)["encoding"] if encoding: json_str = json_bytes.decode(encoding, "replace") try: json_data = json.loads(json_str) except json.decoder.JSONDecodeError: pass return json_data def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from npm. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) == 0: return {} project_dirname = lst[0] package_json_path = os.path.join(dir_path, project_dirname, "package.json") if not os.path.exists(package_json_path): return {} with open(package_json_path, "rb") as package_json_file: package_json_bytes = package_json_file.read() return load_json(package_json_bytes) diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py index 2a2e0d3..99d0487 100644 --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -1,250 +1,250 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import logging from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple from urllib.parse import urlparse from pkginfo import UnpackedSDist from swh.model.model import ( Person, Sha1Git, TimestampWithTimezone, Revision, RevisionType, ) from swh.loader.package.loader import PackageLoader from swh.loader.package.utils import api_info, release_name, EMPTY_AUTHOR logger = logging.getLogger(__name__) class PyPILoader(PackageLoader): """Load pypi origin's artifact releases into swh archive. 
""" visit_type = "pypi" def __init__(self, url): super().__init__(url=url) self._info = None self.provider_url = pypi_api_url(self.url) @property def info(self) -> Dict: """Return the project metadata information (fetched from pypi registry) """ if not self._info: self._info = api_info(self.provider_url) return self._info def get_versions(self) -> Sequence[str]: return self.info["releases"].keys() def get_default_version(self) -> str: return self.info["info"]["version"] def get_package_info( self, version: str ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: res = [] for meta in self.info["releases"][version]: if meta["packagetype"] != "sdist": continue filename = meta["filename"] p_info = { "url": meta["url"], "filename": filename, "raw": meta, } res.append((version, p_info)) if len(res) == 1: version, p_info = res[0] yield release_name(version), p_info else: for version, p_info in res: yield release_name(version, p_info["filename"]), p_info def resolve_revision_from( self, known_artifacts: Dict, artifact_metadata: Dict ) -> Optional[bytes]: return artifact_to_revision_id(known_artifacts, artifact_metadata) def build_revision( self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None # from intrinsic metadata name = i_metadata["version"] _author = author(i_metadata) # from extrinsic metadata message = a_metadata.get("comment_text", "") message = "%s: %s" % (name, message) if message else name date = TimestampWithTimezone.from_iso8601(a_metadata["upload_time"]) return Revision( type=RevisionType.TAR, message=message.encode("utf-8"), author=_author, date=date, committer=_author, committer_date=date, - parents=[], + parents=(), directory=directory, synthetic=True, metadata={ "intrinsic": {"tool": "PKG-INFO", "raw": i_metadata,}, "extrinsic": { "provider": self.provider_url, "when": self.visit_date.isoformat(), "raw": a_metadata, }, }, ) def artifact_to_revision_id( known_artifacts: Dict, artifact_metadata: Dict ) -> Optional[bytes]: """Given metadata artifact, solves the associated revision id. The following code allows to deal with 2 metadata formats (column metadata in 'revision') - old format sample:: { 'original_artifact': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa ... }, ... } - new format sample:: { 'original_artifact': [{ 'checksums': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa ... }, }], ... } """ sha256 = artifact_metadata["digests"]["sha256"] for rev_id, known_artifact in known_artifacts.items(): original_artifact = known_artifact["original_artifact"] if isinstance(original_artifact, dict): # previous loader-pypi version stored metadata as dict original_sha256 = original_artifact["sha256"] if sha256 == original_sha256: return rev_id continue # new pypi loader actually store metadata dict differently... 
assert isinstance(original_artifact, list) # current loader-pypi stores metadata as list of dict for original_artifact in known_artifact["original_artifact"]: if sha256 == original_artifact["checksums"]["sha256"]: return rev_id return None def pypi_api_url(url: str) -> str: """Compute api url from a project url Args: url (str): PyPI instance's url (e.g: https://pypi.org/project/requests) This deals with correctly transforming the project's api url (e.g https://pypi.org/pypi/requests/json) Returns: api url """ p_url = urlparse(url) project_name = p_url.path.rstrip("/").split("/")[-1] url = "%s://%s/pypi/%s/json" % (p_url.scheme, p_url.netloc, project_name) return url def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from pypi. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) != 1: return {} project_dirname = lst[0] pkginfo_path = os.path.join(dir_path, project_dirname, "PKG-INFO") if not os.path.exists(pkginfo_path): return {} pkginfo = UnpackedSDist(pkginfo_path) raw = pkginfo.__dict__ raw.pop("filename") # this gets added with the ondisk location return raw def author(data: Dict) -> Person: """Given a dict of project/release artifact information (coming from PyPI), returns an author subset. Args: data (dict): Representing either artifact information or release information. Returns: swh-model dict representing a person. """ name = data.get("author") email = data.get("author_email") fullname = None # type: Optional[str] if email: fullname = "%s <%s>" % (name, email) else: fullname = name if not fullname: return EMPTY_AUTHOR if name is not None: name = name.encode("utf-8") if email is not None: email = email.encode("utf-8") return Person(fullname=fullname.encode("utf-8"), name=name, email=email)
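The recurring change across these loaders, `parents=[]` becoming `parents=()`
(and the deposit loader wrapping its parent list in `tuple(...)`), tracks the
swh.model bump to >= 0.3.0 in requirements-swh.txt, where Revision.parents is
tuple-typed. A minimal sketch (deliberately not swh.model itself) of why a
frozen, validated model field pushes call sites toward tuples:

    # Hypothetical stand-in for an attrs-based, frozen model class with a
    # tuple-typed field: a strict validator rejects lists at construction
    # time, and tuples keep frozen instances hashable.
    import attr

    @attr.s(frozen=True)
    class FrozenRevision:
        parents = attr.ib(
            type=tuple,
            validator=attr.validators.instance_of(tuple),
            default=(),
        )

    FrozenRevision(parents=())   # fine
    FrozenRevision(parents=[])   # TypeError from the instance_of(tuple) validator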