diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py index c2b654e..b5ec9dc 100644 --- a/swh/loader/package/debian/loader.py +++ b/swh/loader/package/debian/loader.py @@ -1,475 +1,462 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import email.utils import logging from os import path import re import subprocess from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple import attr from dateutil.parser import parse as parse_date from debian.changelog import Changelog from debian.deb822 import Dsc from swh.loader.package.loader import BasePackageInfo, PackageLoader, PartialExtID from swh.loader.package.utils import download, release_name from swh.model.hashutil import hash_to_bytes from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) UPLOADERS_SPLIT = re.compile(r"(?<=\>)\s*,\s*") EXTID_TYPE = "dsc-sha256" EXTID_VERSION = 1 class DscCountError(ValueError): """Raised when an unexpected number of .dsc files is seen""" pass @attr.s class DebianFileMetadata: name = attr.ib(type=str) """Filename""" size = attr.ib(type=int) uri = attr.ib(type=str) """URL of this specific file""" # all checksums are not always available, make them optional sha256 = attr.ib(type=str, default="") md5sum = attr.ib(type=str, default="") sha1 = attr.ib(type=str, default="") # Some of the DSC files imported in swh apparently had a Checksums-SHA512 # field which got recorded in the archive. Current versions of dpkg-source # don't seem to generate them, but keep the field available for # future-proofing. sha512 = attr.ib(type=str, default="") @attr.s class DebianPackageChangelog: person = attr.ib(type=Dict[str, str]) """A dict with fields like, model.Person, except they are str instead of bytes, and 'email' is optional.""" date = attr.ib(type=str) """Date of the changelog entry.""" history = attr.ib(type=List[Tuple[str, str]]) """List of tuples (package_name, version)""" @attr.s class DebianPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) files = attr.ib(type=Dict[str, DebianFileMetadata]) """Metadata of the files (.deb, .dsc, ...) of the package.""" name = attr.ib(type=str) intrinsic_version = attr.ib(type=str) """eg. ``0.7.2-3``, while :attr:`version` would be ``stretch/contrib/0.7.2-3``""" @classmethod def from_metadata( cls, a_metadata: Dict[str, Any], url: str, version: str ) -> "DebianPackageInfo": intrinsic_version = a_metadata["version"] assert "/" in version and "/" not in intrinsic_version, ( version, intrinsic_version, ) return cls( url=url, filename=None, version=version, raw_info=a_metadata, files={ file_name: DebianFileMetadata(**file_metadata) for (file_name, file_metadata) in a_metadata.get("files", {}).items() }, name=a_metadata["name"], intrinsic_version=intrinsic_version, ) def extid(self) -> Optional[PartialExtID]: dsc_files = [ file for (name, file) in self.files.items() if name.endswith(".dsc") ] if len(dsc_files) != 1: raise DscCountError( f"Expected exactly one .dsc file for package {self.name}, " f"got {len(dsc_files)}" ) return (EXTID_TYPE, EXTID_VERSION, hash_to_bytes(dsc_files[0].sha256)) @attr.s class IntrinsicPackageMetadata: """Metadata extracted from a package's .dsc file.""" name = attr.ib(type=str) version = attr.ib(type=str) changelog = attr.ib(type=DebianPackageChangelog) maintainers = attr.ib(type=List[Dict[str, str]]) """A list of dicts with fields like, model.Person, except they are str instead of bytes, and 'email' is optional.""" class DebianLoader(PackageLoader[DebianPackageInfo]): """Load debian origins into swh archive. """ visit_type = "deb" def __init__( self, storage: StorageInterface, url: str, packages: Mapping[str, Any], max_content_size: Optional[int] = None, ): """Debian Loader implementation. Args: url: Origin url (e.g. deb://Debian/packages/cicero) date: Ignored packages: versioned packages and associated artifacts, example:: { 'stretch/contrib/0.7.2-3': { 'name': 'cicero', 'version': '0.7.2-3' 'files': { 'cicero_0.7.2-3.diff.gz': { 'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce', 'name': 'cicero_0.7.2-3.diff.gz', 'sha256': 'f039c9642fe15c75bed5254315e2a29f...', 'size': 3964, 'uri': 'http://d.d.o/cicero_0.7.2-3.diff.gz', }, 'cicero_0.7.2-3.dsc': { 'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a', 'name': 'cicero_0.7.2-3.dsc', 'sha256': '35b7f1048010c67adfd8d70e4961aefb...', 'size': 1864, 'uri': 'http://d.d.o/cicero_0.7.2-3.dsc', }, 'cicero_0.7.2.orig.tar.gz': { 'md5sum': '4353dede07c5728319ba7f5595a7230a', 'name': 'cicero_0.7.2.orig.tar.gz', 'sha256': '63f40f2436ea9f67b44e2d4bd669dbab...', 'size': 96527, 'uri': 'http://d.d.o/cicero_0.7.2.orig.tar.gz', } }, }, # ... } """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.packages = packages def get_versions(self) -> Sequence[str]: """Returns the keys of the packages input (e.g. stretch/contrib/0.7.2-3, etc...) """ return list(self.packages.keys()) def get_package_info(self, version: str) -> Iterator[Tuple[str, DebianPackageInfo]]: meta = self.packages[version] p_info = DebianPackageInfo.from_metadata(meta, url=self.url, version=version) yield release_name(version), p_info def download_package( self, p_info: DebianPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Contrary to other package loaders (1 package, 1 artifact), `p_info.files` represents the package's datafiles set to fetch: - .orig.tar.gz - .dsc - .diff.gz This is delegated to the `download_package` function. """ all_hashes = download_package(p_info, tmpdir) logger.debug("all_hashes: %s", all_hashes) res = [] for hashes in all_hashes.values(): res.append((tmpdir, hashes)) logger.debug("res: %s", res) return res def uncompress( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str ) -> str: logger.debug("dl_artifacts: %s", dl_artifacts) return extract_package(dl_artifacts, dest=dest) def build_release( self, p_info: DebianPackageInfo, uncompressed_path: str, directory: Sha1Git, ) -> Optional[Release]: dsc_url, dsc_name = dsc_information(p_info) if not dsc_name: raise ValueError("dsc name for url %s should not be None" % dsc_url) dsc_path = path.join(path.dirname(uncompressed_path), dsc_name) intrinsic_metadata = get_intrinsic_package_metadata( p_info, dsc_path, uncompressed_path ) logger.debug("intrinsic_metadata: %s", intrinsic_metadata) logger.debug("p_info: %s", p_info) msg = ( f"Synthetic release for Debian source package {p_info.name} " f"version {p_info.intrinsic_version}\n" ) author = prepare_person(intrinsic_metadata.changelog.person) date = TimestampWithTimezone.from_iso8601(intrinsic_metadata.changelog.date) # inspired from swh.loader.debian.converters.package_metadata_to_revision return Release( name=p_info.intrinsic_version.encode(), message=msg.encode(), author=author, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def uid_to_person(uid: str) -> Dict[str, str]: """Convert an uid to a person suitable for insertion. Args: uid: an uid of the form "Name " Returns: a dictionary with the following keys: - name: the name associated to the uid - email: the mail associated to the uid - fullname: the actual uid input """ - logger.debug("uid: %s", uid) - ret = { - "name": "", - "email": "", - "fullname": uid, - } - - name, mail = email.utils.parseaddr(uid) - if name and email: - ret["name"] = name - ret["email"] = mail - else: - ret["name"] = uid - return ret + person = Person.from_fullname(uid.encode("utf-8")) + return {k: v.decode("utf-8") for k, v in person.to_dict().items() if v is not None} def prepare_person(person: Mapping[str, str]) -> Person: """Prepare person for swh serialization... Args: A person dict Returns: A person ready for storage """ return Person.from_dict( {key: value.encode("utf-8") for (key, value) in person.items()} ) def download_package(p_info: DebianPackageInfo, tmpdir: Any) -> Mapping[str, Any]: """Fetch a source package in a temporary directory and check the checksums for all files. Args: p_info: Information on a package tmpdir: Where to download and extract the files to ingest Returns: Dict of swh hashes per filename key """ all_hashes = {} for filename, fileinfo in p_info.files.items(): uri = fileinfo.uri logger.debug("fileinfo: %s", fileinfo) extrinsic_hashes = {"md5": fileinfo.md5sum} if fileinfo.sha256: extrinsic_hashes["sha256"] = fileinfo.sha256 if fileinfo.sha1: extrinsic_hashes["sha1"] = fileinfo.sha1 logger.debug("extrinsic_hashes(%s): %s", filename, extrinsic_hashes) _, hashes = download( uri, dest=tmpdir, filename=filename, hashes=extrinsic_hashes ) all_hashes[filename] = hashes logger.debug("all_hashes: %s", all_hashes) return all_hashes def dsc_information(p_info: DebianPackageInfo) -> Tuple[Optional[str], Optional[str]]: """Retrieve dsc information from a package. Args: p_info: Package metadata information Returns: Tuple of dsc file's uri, dsc's full disk path """ dsc_name = None dsc_url = None for filename, fileinfo in p_info.files.items(): if filename.endswith(".dsc"): if dsc_name: raise DscCountError( "Package %s_%s references several dsc files." % (p_info.name, p_info.intrinsic_version) ) dsc_url = fileinfo.uri dsc_name = filename return dsc_url, dsc_name def extract_package(dl_artifacts: List[Tuple[str, Mapping]], dest: str) -> str: """Extract a Debian source package to a given directory. Note that after extraction the target directory will be the root of the extracted package, rather than containing it. Args: package: package information dictionary dest: directory where the package files are stored Returns: Package extraction directory """ a_path = dl_artifacts[0][0] logger.debug("dl_artifacts: %s", dl_artifacts) for _, hashes in dl_artifacts: logger.debug("hashes: %s", hashes) filename = hashes["filename"] if filename.endswith(".dsc"): dsc_name = filename break dsc_path = path.join(a_path, dsc_name) destdir = path.join(dest, "extracted") logfile = path.join(dest, "extract.log") logger.debug( "extract Debian source package %s in %s" % (dsc_path, destdir), extra={"swh_type": "deb_extract", "swh_dsc": dsc_path, "swh_destdir": destdir,}, ) cmd = [ "dpkg-source", "--no-copy", "--no-check", "--ignore-bad-version", "-x", dsc_path, destdir, ] try: with open(logfile, "w") as stdout: subprocess.check_call(cmd, stdout=stdout, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: logdata = open(logfile, "r").read() raise ValueError( "dpkg-source exited with code %s: %s" % (e.returncode, logdata) ) from None return destdir def get_intrinsic_package_metadata( p_info: DebianPackageInfo, dsc_path: str, extracted_path: str ) -> IntrinsicPackageMetadata: """Get the package metadata from the source package at dsc_path, extracted in extracted_path. Args: p_info: the package information dsc_path: path to the package's dsc file extracted_path: the path where the package got extracted Returns: dict: a dictionary with the following keys: - history: list of (package_name, package_version) tuples parsed from the package changelog """ with open(dsc_path, "rb") as dsc: parsed_dsc = Dsc(dsc) # Parse the changelog to retrieve the rest of the package information changelog_path = path.join(extracted_path, "debian/changelog") with open(changelog_path, "rb") as changelog_file: try: parsed_changelog = Changelog(changelog_file) except UnicodeDecodeError: logger.warning( "Unknown encoding for changelog %s," " falling back to iso" % changelog_path, extra={ "swh_type": "deb_changelog_encoding", "swh_name": p_info.name, "swh_version": str(p_info.version), "swh_changelog": changelog_path, }, ) # need to reset as Changelog scrolls to the end of the file changelog_file.seek(0) parsed_changelog = Changelog(changelog_file, encoding="iso-8859-15") history: List[Tuple[str, str]] = [] for block in parsed_changelog: assert block.package is not None history.append((block.package, str(block.version))) changelog = DebianPackageChangelog( person=uid_to_person(parsed_changelog.author), date=parse_date(parsed_changelog.date).isoformat(), history=history[1:], ) maintainers = [ uid_to_person(parsed_dsc["Maintainer"]), ] maintainers.extend( uid_to_person(person) for person in UPLOADERS_SPLIT.split(parsed_dsc.get("Uploaders", "")) ) return IntrinsicPackageMetadata( name=p_info.name, version=str(p_info.intrinsic_version), changelog=changelog, maintainers=maintainers, ) diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py index 32b4db1..a302cbf 100644 --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -1,296 +1,296 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from codecs import BOM_UTF8 import json import logging import os from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from urllib.parse import quote import attr import chardet from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, PartialExtID, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import api_info, cached_method, release_name from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) -EMPTY_PERSON = Person(fullname=b"", name=None, email=None) +EMPTY_PERSON = Person.from_fullname(b"") EXTID_TYPE = "npm-archive-sha1" EXTID_VERSION = 0 @attr.s class NpmPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) date = attr.ib(type=Optional[str]) shasum = attr.ib(type=str) """sha1 checksum""" @classmethod def from_metadata( cls, project_metadata: Dict[str, Any], version: str ) -> "NpmPackageInfo": package_metadata = project_metadata["versions"][version] url = package_metadata["dist"]["tarball"] # No date available in intrinsic metadata: retrieve it from the API # metadata, using the version number that the API claims this package # has. extrinsic_version = package_metadata["version"] if "time" in project_metadata: date = project_metadata["time"][extrinsic_version] elif "mtime" in package_metadata: date = package_metadata["mtime"] else: date = None return cls( url=url, filename=os.path.basename(url), date=date, shasum=package_metadata["dist"]["shasum"], version=extrinsic_version, raw_info=package_metadata, directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="replicate-npm-package-json", metadata=json.dumps(package_metadata).encode(), ) ], ) def extid(self) -> PartialExtID: return (EXTID_TYPE, EXTID_VERSION, hash_to_bytes(self.shasum)) class NpmLoader(PackageLoader[NpmPackageInfo]): """Load npm origin's artifact releases into swh archive. """ visit_type = "npm" def __init__( self, storage: StorageInterface, url: str, max_content_size: Optional[int] = None, ): """Constructor Args str: origin url (e.g. https://www.npmjs.com/package/) """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.package_name = url.split("https://www.npmjs.com/package/")[1] safe_name = quote(self.package_name, safe="") self.provider_url = f"https://replicate.npmjs.com/{safe_name}/" self._info: Dict[str, Any] = {} self._versions = None @cached_method def _raw_info(self) -> bytes: return api_info(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from npm registry) """ return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return sorted(list(self.info()["versions"].keys())) def get_default_version(self) -> str: return self.info()["dist-tags"].get("latest", "") def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, NpmPackageInfo]]: p_info = NpmPackageInfo.from_metadata( project_metadata=self.info(), version=version ) yield release_name(version), p_info def build_release( self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None author = extract_npm_package_author(i_metadata) msg = ( f"Synthetic release for NPM source package {self.package_name} " f"version {p_info.version}\n" ) if p_info.date is None: url = p_info.url artifact_name = os.path.basename(url) raise ValueError( "Origin %s: Cannot determine upload time for artifact %s." % (p_info.url, artifact_name) ) date = TimestampWithTimezone.from_iso8601(p_info.date) # FIXME: this is to remain bug-compatible with earlier versions: date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0)) r = Release( name=p_info.version.encode(), message=msg.encode(), author=author, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) return r def _author_str(author_data: Union[Dict, List, str]) -> str: """Parse author from package.json author fields """ if isinstance(author_data, dict): author_str = "" name = author_data.get("name") if name is not None: if isinstance(name, str): author_str += name elif isinstance(name, list): author_str += _author_str(name[0]) if len(name) > 0 else "" email = author_data.get("email") if email is not None: author_str += f" <{email}>" result = author_str elif isinstance(author_data, list): result = _author_str(author_data[0]) if len(author_data) > 0 else "" else: result = author_data return result def extract_npm_package_author(package_json: Dict[str, Any]) -> Person: """ Extract package author from a ``package.json`` file content and return it in swh format. Args: package_json: Dict holding the content of parsed ``package.json`` file Returns: Person """ for author_key in ("author", "authors"): if author_key in package_json: author_data = package_json[author_key] if author_data is None: return EMPTY_PERSON author_str = _author_str(author_data) return Person.from_fullname(author_str.encode()) return EMPTY_PERSON def _lstrip_bom(s, bom=BOM_UTF8): if s.startswith(bom): return s[len(bom) :] else: return s def load_json(json_bytes): """ Try to load JSON from bytes and return a dictionary. First try to decode from utf-8. If the decoding failed, try to detect the encoding and decode again with replace error handling. If JSON is malformed, an empty dictionary will be returned. Args: json_bytes (bytes): binary content of a JSON file Returns: dict: JSON data loaded in a dictionary """ json_data = {} try: json_str = _lstrip_bom(json_bytes).decode("utf-8") except UnicodeDecodeError: encoding = chardet.detect(json_bytes)["encoding"] if encoding: json_str = json_bytes.decode(encoding, "replace") try: json_data = json.loads(json_str) except json.decoder.JSONDecodeError: pass return json_data def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from npm. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) == 0: return {} project_dirname = lst[0] package_json_path = os.path.join(dir_path, project_dirname, "package.json") if not os.path.exists(package_json_path): return {} with open(package_json_path, "rb") as package_json_file: package_json_bytes = package_json_file.read() return load_json(package_json_bytes) diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py index bcb9a47..d00b9a9 100644 --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -1,641 +1,641 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json import os import pytest from swh.loader.package import __version__ from swh.loader.package.npm.loader import ( NpmLoader, _author_str, extract_npm_package_author, ) from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Person, RawExtrinsicMetadata, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.interface import PagedResult @pytest.fixture def org_api_info(datadir) -> bytes: with open(os.path.join(datadir, "https_replicate.npmjs.com", "org"), "rb",) as f: return f.read() def test_npm_author_str(): for author, expected_author in [ ("author", "author"), ( ["Al from quantum leap", "hal from 2001 space odyssey"], "Al from quantum leap", ), ([], ""), ({"name": "groot", "email": "groot@galaxy.org",}, "groot "), ({"name": "somebody",}, "somebody"), ({"email": "no@one.org"}, " "), # note first elt is an extra blank ({"name": "no one", "email": None,}, "no one"), ({"email": None,}, ""), ({"name": None}, ""), ({"name": None, "email": None,}, ""), ({}, ""), (None, None), ({"name": []}, "",), ( {"name": ["Susan McSween", "William H. Bonney", "Doc Scurlock",]}, "Susan McSween", ), (None, None), ]: assert _author_str(author) == expected_author def test_npm_extract_npm_package_author(datadir): package_metadata_filepath = os.path.join( datadir, "https_replicate.npmjs.com", "org_visit1" ) with open(package_metadata_filepath) as json_file: package_metadata = json.load(json_file) extract_npm_package_author(package_metadata["versions"]["0.0.2"]) == Person( fullname=b"mooz ", name=b"mooz", email=b"stillpedant@gmail.com", ) assert extract_npm_package_author(package_metadata["versions"]["0.0.3"]) == Person( fullname=b"Masafumi Oyamada ", name=b"Masafumi Oyamada", email=b"stillpedant@gmail.com", ) package_json = json.loads( """ { "name": "highlightjs-line-numbers.js", "version": "2.7.0", "description": "Highlight.js line numbers plugin.", "main": "src/highlightjs-line-numbers.js", "dependencies": {}, "devDependencies": { "gulp": "^4.0.0", "gulp-rename": "^1.4.0", "gulp-replace": "^0.6.1", "gulp-uglify": "^1.2.0" }, "repository": { "type": "git", "url": "https://github.com/wcoder/highlightjs-line-numbers.js.git" }, "author": "Yauheni Pakala ", "license": "MIT", "bugs": { "url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues" }, "homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/" }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"Yauheni Pakala ", name=b"Yauheni Pakala", email=b"evgeniy.pakalo@gmail.com", ) package_json = json.loads( """ { "name": "3-way-diff", "version": "0.0.1", "description": "3-way diffing of JavaScript objects", "main": "index.js", "authors": [ { "name": "Shawn Walsh", "url": "https://github.com/shawnpwalsh" }, { "name": "Markham F Rollins IV", "url": "https://github.com/mrollinsiv" } ], "keywords": [ "3-way diff", "3 way diff", "three-way diff", "three way diff" ], "devDependencies": { "babel-core": "^6.20.0", "babel-preset-es2015": "^6.18.0", "mocha": "^3.0.2" }, "dependencies": { "lodash": "^4.15.0" } }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"Shawn Walsh", name=b"Shawn Walsh", email=None ) package_json = json.loads( """ { "name": "yfe-ynpm", "version": "1.0.0", "homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm", "repository": { "type": "git", "url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git" }, "author": [ "fengmk2 (https://fengmk2.com)", "xufuzi (https://7993.org)" ], "license": "MIT" }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"fengmk2 (https://fengmk2.com)", name=b"fengmk2", email=b"fengmk2@gmail.com", ) package_json = json.loads( """ { "name": "umi-plugin-whale", "version": "0.0.8", "description": "Internal contract component", "authors": { "name": "xiaohuoni", "email": "448627663@qq.com" }, "repository": "alitajs/whale", "devDependencies": { "np": "^3.0.4", "umi-tools": "*" }, "license": "MIT" }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"xiaohuoni <448627663@qq.com>", name=b"xiaohuoni", email=b"448627663@qq.com", ) package_json_no_authors = json.loads( """{ "authors": null, "license": "MIT" }""" ) - assert extract_npm_package_author(package_json_no_authors) == Person( - fullname=b"", name=None, email=None + assert extract_npm_package_author(package_json_no_authors) == Person.from_fullname( + b"" ) def normalize_hashes(hashes): if isinstance(hashes, str): return hash_to_bytes(hashes) if isinstance(hashes, list): return [hash_to_bytes(x) for x in hashes] return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()} _expected_new_contents_first_visit = normalize_hashes( [ "4ce3058e16ab3d7e077f65aabf855c34895bf17c", "858c3ceee84c8311adc808f8cdb30d233ddc9d18", "0fa33b4f5a4e0496da6843a38ff1af8b61541996", "85a410f8ef8eb8920f2c384a9555566ad4a2e21b", "9163ac8025923d5a45aaac482262893955c9b37b", "692cf623b8dd2c5df2c2998fd95ae4ec99882fb4", "18c03aac6d3e910efb20039c15d70ab5e0297101", "41265c42446aac17ca769e67d1704f99e5a1394d", "783ff33f5882813dca9239452c4a7cadd4dba778", "b029cfb85107aee4590c2434a3329bfcf36f8fa1", "112d1900b4c2e3e9351050d1b542c9744f9793f3", "5439bbc4bd9a996f1a38244e6892b71850bc98fd", "d83097a2f994b503185adf4e719d154123150159", "d0939b4898e83090ee55fd9d8a60e312cfadfbaf", "b3523a26f7147e4af40d9d462adaae6d49eda13e", "cd065fb435d6fb204a8871bcd623d0d0e673088c", "2854a40855ad839a54f4b08f5cff0cf52fca4399", "b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe", "0f73d56e1cf480bded8a1ecf20ec6fc53c574713", "0d9882b2dfafdce31f4e77fe307d41a44a74cefe", "585fc5caab9ead178a327d3660d35851db713df1", "e8cd41a48d79101977e3036a87aeb1aac730686f", "5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7", "9c3cc2763bf9e9e37067d3607302c4776502df98", "3649a68410e354c83cd4a38b66bd314de4c8f5c9", "e96ed0c091de1ebdf587104eaf63400d1974a1fe", "078ca03d2f99e4e6eab16f7b75fbb7afb699c86c", "38de737da99514de6559ff163c988198bc91367a", ] ) _expected_new_directories_first_visit = normalize_hashes( [ "3370d20d6f96dc1c9e50f083e2134881db110f4f", "42753c0c2ab00c4501b552ac4671c68f3cf5aece", "d7895533ef5edbcffdea3f057d9fef3a1ef845ce", "80579be563e2ef3e385226fe7a3f079b377f142c", "3b0ddc6a9e58b4b53c222da4e27b280b6cda591c", "bcad03ce58ac136f26f000990fc9064e559fe1c0", "5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca", "e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd", "584b5b4b6cf7f038095e820b99386a9c232de931", "184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a", "bb5f4ee143c970367eb409f2e4c1104898048b9d", "1b95491047add1103db0dfdfa84a9735dcb11e88", "a00c6de13471a2d66e64aca140ddb21ef5521e62", "5ce6c1cd5cda2d546db513aaad8c72a44c7771e2", "c337091e349b6ac10d38a49cdf8c2401ef9bb0f2", "202fafcd7c0f8230e89d5496ad7f44ab12b807bf", "775cc516543be86c15c1dc172f49c0d4e6e78235", "ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e", ] ) _expected_new_releases_first_visit = normalize_hashes( { "d38cc0b571cd41f3c85513864e049766b42032a7": ( "42753c0c2ab00c4501b552ac4671c68f3cf5aece" ), "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295": ( "3370d20d6f96dc1c9e50f083e2134881db110f4f" ), "6e976db82f6c310596b21fb0ed8b11f507631434": ( "d7895533ef5edbcffdea3f057d9fef3a1ef845ce" ), } ) def package_url(package): return "https://www.npmjs.com/package/%s" % package def package_metadata_url(package): return "https://replicate.npmjs.com/%s/" % package def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info): package = "org" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) release_id = "d38cc0b571cd41f3c85513864e049766b42032a7" versions = [ ("0.0.2", release_id), ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"), ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"), ] expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target=b"releases/0.0.4", target_type=TargetType.ALIAS ), **{ b"releases/" + version_name.encode(): SnapshotBranch( target=hash_to_bytes(version_id), target_type=TargetType.RELEASE, ) for (version_name, version_id) in versions }, }, ) check_snapshot(expected_snapshot, swh_storage) assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release( name=b"0.0.2", message=b"Synthetic release for NPM source package org version 0.0.2\n", target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"), target_type=ModelObjectType.DIRECTORY, synthetic=True, author=Person( fullname=b"mooz ", name=b"mooz", email=b"stillpedant@gmail.com", ), date=TimestampWithTimezone.from_datetime( datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc) ), id=hash_to_bytes(release_id), ) contents = swh_storage.content_get(_expected_new_contents_first_visit) count = sum(0 if content is None else 1 for content in contents) assert count == len(_expected_new_contents_first_visit) assert ( list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == [] ) assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == [] metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", ) for (version_name, release_id) in versions: release = swh_storage.release_get([hash_to_bytes(release_id)])[0] assert release.target_type == ModelObjectType.DIRECTORY directory_id = release.target directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id, ) release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release_id), ) expected_metadata = [ RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.npm.loader.NpmLoader", version=__version__, ), discovery_date=loader.visit_date, format="replicate-npm-package-json", metadata=json.dumps( json.loads(org_api_info)["versions"][version_name] ).encode(), origin="https://www.npmjs.com/package/org", release=release_swhid, ) ] assert swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_npm_loader_incremental_visit(swh_storage, requests_mock_datadir_visits): package = "org" url = package_url(package) loader = NpmLoader(swh_storage, url) expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249") actual_load_status = loader.load() assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats # reset loader internal state del loader._cached_info del loader._cached__raw_info actual_load_status2 = loader.load() assert actual_load_status2["status"] == "eventful" snap_id2 = actual_load_status2["snapshot_id"] assert snap_id2 is not None assert snap_id2 != actual_load_status["snapshot_id"] assert_last_visit_matches(swh_storage, url, status="full", type="npm") stats = get_stats(swh_storage) assert { # 3 new releases artifacts "content": len(_expected_new_contents_first_visit) + 14, "directory": len(_expected_new_directories_first_visit) + 15, "origin": 1, "origin_visit": 2, "release": len(_expected_new_releases_first_visit) + 3, "revision": 0, "skipped_content": 0, "snapshot": 2, } == stats urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith("https://registry.npmjs.org") ] assert len(urls) == len(set(urls)) # we visited each artifact once across @pytest.mark.usefixtures("requests_mock_datadir") def test_npm_loader_version_divergence(swh_storage): package = "@aller_shared" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("ebbe6397d0c2a6cf7cba40fa5b043c59dd4f2497") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"releases/0.1.0" ), b"releases/0.1.0": SnapshotBranch( target_type=TargetType.RELEASE, target=hash_to_bytes("04c66f3a82aa001e8f1b45246b58b82d2b0ca0df"), ), b"releases/0.1.1-alpha.14": SnapshotBranch( target_type=TargetType.RELEASE, target=hash_to_bytes("90cc04dc72193f3b1444f10e1c525bee2ea9dac6"), ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { # 1 new releases artifacts "content": 534, "directory": 153, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion """ package = "nativescript-telerik-analytics" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() # no branch as one artifact without any intrinsic metadata expected_snapshot = Snapshot( id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}, ) assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot.id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_npm_artifact_with_no_upload_time(swh_storage, requests_mock_datadir): """With no time upload, artifact is skipped """ package = "jammit-no-time" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() # no branch as one artifact without any intrinsic metadata expected_snapshot = Snapshot( id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}, ) assert actual_load_status == { "status": "uneventful", "snapshot_id": expected_snapshot.id.hex(), } assert_last_visit_matches( swh_storage, url, status="partial", type="npm", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_npm_artifact_use_mtime_if_no_time(swh_storage, requests_mock_datadir): """With no time upload, artifact is skipped """ package = "jammit-express" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("33b8f105d48ce16b6c59158af660e0cc78bcbef4") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } # artifact is used expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"releases/0.0.1" ), b"releases/0.0.1": SnapshotBranch( target_type=TargetType.RELEASE, target=hash_to_bytes("3e3b800570869fa9b3dbc302500553e62400cc06"), ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_npm_no_artifact(swh_storage, requests_mock_datadir): """If no artifacts at all is found for origin, the visit fails completely """ package = "catify" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == { "status": "failed", } assert_last_visit_matches(swh_storage, url, status="failed", type="npm") def test_npm_origin_not_found(swh_storage, requests_mock_datadir): url = package_url("non-existent-url") loader = NpmLoader(swh_storage, url) assert loader.load() == {"status": "failed"} assert_last_visit_matches( swh_storage, url, status="not_found", type="npm", snapshot=None ) diff --git a/swh/loader/package/opam/loader.py b/swh/loader/package/opam/loader.py index 0fe8967..d2a688f 100644 --- a/swh/loader/package/opam/loader.py +++ b/swh/loader/package/opam/loader.py @@ -1,259 +1,259 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import os from subprocess import PIPE, Popen, call from typing import Iterator, List, Optional, Tuple import attr from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import cached_method from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, ) from swh.storage.interface import StorageInterface @attr.s class OpamPackageInfo(BasePackageInfo): author = attr.ib(type=Person) committer = attr.ib(type=Person) def opam_read( cmd: List[str], init_error_msg_if_any: Optional[str] = None ) -> Optional[str]: """This executes an opam command and returns the first line of the output. Args: cmd: Opam command to execute as a list of string init_error_msg_if_any: Error message to raise in case a problem occurs during initialization Raises: ValueError with the init_error_msg_if_any content in case stdout is not consumable and the variable is provided with non empty value. Returns: the first line of the executed command output """ with Popen(cmd, stdout=PIPE) as proc: if proc.stdout is not None: for line in io.TextIOWrapper(proc.stdout): # care only for the first line output result (mostly blank separated # values, callers will deal with the parsing of the line) return line elif init_error_msg_if_any: raise ValueError(init_error_msg_if_any) return None class OpamLoader(PackageLoader[OpamPackageInfo]): """Load all versions of a given package in a given opam repository. The state of the opam repository is stored in a directory called an opam root. This folder is a requisite for the opam binary to actually list information on package. When initialize_opam_root is False (the default for production workers), the opam root must already have been configured outside of the loading process. If not an error is raised, thus failing the loading. For standalone workers, initialize_opam_root must be set to True, so the ingestion can take care of installing the required opam root properly. The remaining ingestion uses the opam binary to give the versions of the given package. Then, for each version, the loader uses the opam binary to list the tarball url to fetch and ingest. """ visit_type = "opam" def __init__( self, storage: StorageInterface, url: str, opam_root: str, opam_instance: str, opam_url: str, opam_package: str, max_content_size: Optional[int] = None, initialize_opam_root: bool = False, ): super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.opam_root = opam_root self.opam_instance = opam_instance self.opam_url = opam_url self.opam_package = opam_package self.initialize_opam_root = initialize_opam_root def get_package_dir(self) -> str: return ( f"{self.opam_root}/repo/{self.opam_instance}/packages/{self.opam_package}" ) def get_package_name(self, version: str) -> str: return f"{self.opam_package}.{version}" def get_package_file(self, version: str) -> str: return f"{self.get_package_dir()}/{self.get_package_name(version)}/opam" def get_metadata_authority(self): return MetadataAuthority(type=MetadataAuthorityType.FORGE, url=self.opam_url) @cached_method def _compute_versions(self) -> List[str]: """Compute the versions using opam internals Raises: ValueError in case the lister is not able to determine the list of versions Returns: The list of versions for the package """ # TODO: use `opam show` instead of this workaround when it support the `--repo` # flag package_dir = self.get_package_dir() if not os.path.exists(package_dir): raise ValueError( f"can't get versions for package {self.opam_package} " f"(at url {self.url})." ) versions = [ ".".join(version.split(".")[1:]) for version in os.listdir(package_dir) ] if not versions: raise ValueError( f"can't get versions for package {self.opam_package} " f"(at url {self.url})" ) versions.sort() return versions def get_versions(self) -> List[str]: """First initialize the opam root directory if needed then start listing the package versions. Raises: ValueError in case the lister is not able to determine the list of versions or if the opam root directory is invalid. """ if self.initialize_opam_root: # for standalone loader (e.g docker), loader must initialize the opam root # folder call( [ "opam", "init", "--reinit", "--bare", "--no-setup", "--root", self.opam_root, self.opam_instance, self.opam_url, ] ) else: # for standard/production loaders, no need to initialize the opam root # folder. It must be present though so check for it, if not present, raise if not os.path.isfile(os.path.join(self.opam_root, "config")): # so if not correctly setup, raise immediately raise ValueError("Invalid opam root") return self._compute_versions() def get_default_version(self) -> str: """Return the most recent version of the package as default.""" return self._compute_versions()[-1] def _opam_show_args(self, version: str): package_file = self.get_package_file(version) return [ "opam", "show", "--color", "never", "--safe", "--normalise", "--root", self.opam_root, "--file", package_file, ] def get_enclosed_single_line_field(self, field, version) -> Optional[str]: result = opam_read(self._opam_show_args(version) + ["--field", field]) # Sanitize the result if any (remove trailing \n and enclosing ") return result.strip().strip('"') if result else None def get_package_info(self, version: str) -> Iterator[Tuple[str, OpamPackageInfo]]: url = self.get_enclosed_single_line_field("url.src:", version) if url is None: raise ValueError( f"can't get field url.src: for version {version} of package {self.opam_package} \ (at url {self.url}) from `opam show`" ) authors_field = self.get_enclosed_single_line_field("authors:", version) fullname = b"" if authors_field is None else str.encode(authors_field) - author = Person(fullname=fullname, name=None, email=None) + author = Person.from_fullname(fullname) maintainer_field = self.get_enclosed_single_line_field("maintainer:", version) fullname = b"" if maintainer_field is None else str.encode(maintainer_field) - committer = Person(fullname=fullname, name=None, email=None) + committer = Person.from_fullname(fullname) with Popen(self._opam_show_args(version) + ["--raw"], stdout=PIPE) as proc: assert proc.stdout is not None metadata = proc.stdout.read() yield self.get_package_name(version), OpamPackageInfo( url=url, filename=None, author=author, committer=committer, version=version, directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( metadata=metadata, format="opam-package-definition", ) ], ) def build_release( self, p_info: OpamPackageInfo, uncompressed_path: str, directory: Sha1Git, ) -> Optional[Release]: msg = ( f"Synthetic release for OPAM source package {self.opam_package} " f"version {p_info.version}\n" ) return Release( name=p_info.version.encode(), author=p_info.author, message=msg.encode(), date=None, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) diff --git a/swh/loader/package/opam/tests/test_opam.py b/swh/loader/package/opam/tests/test_opam.py index 2e53ee7..ade79ef 100644 --- a/swh/loader/package/opam/tests/test_opam.py +++ b/swh/loader/package/opam/tests/test_opam.py @@ -1,361 +1,355 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.loader.package import __version__ from swh.loader.package.loader import RawExtrinsicMetadataCore from swh.loader.package.opam.loader import OpamLoader, OpamPackageInfo from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Person, RawExtrinsicMetadata, Release, Snapshot, SnapshotBranch, TargetType, ) from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.interface import PagedResult OCB_METADATA = b"""\ opam-version: "2.0" name: "ocb" version: "0.1" synopsis: "SVG badge generator" description: "An OCaml library for SVG badge generation. There\'s also a command-line tool provided." maintainer: "OCamlPro " authors: "OCamlPro " license: "ISC" homepage: "https://ocamlpro.github.io/ocb/" doc: "https://ocamlpro.github.io/ocb/api/" bug-reports: "https://github.com/OCamlPro/ocb/issues" depends: [ "ocaml" {>= "4.05"} "dune" {>= "2.0"} "odoc" {with-doc} ] build: [ ["dune" "subst"] {dev} [ "dune" "build" "-p" name "-j" jobs "@install" "@runtest" {with-test} "@doc" {with-doc} ] ] dev-repo: "git+https://github.com/OCamlPro/ocb.git" url { src: "https://github.com/OCamlPro/ocb/archive/0.1.tar.gz" checksum: [ "sha256=aa27684fbda1b8036ae7e3c87de33a98a9cd2662bcc91c8447e00e41476b6a46" "sha512=1260344f184dd8c8074b0439dbcc8a5d59550a654c249cd61913d4c150c664f37b76195ddca38f7f6646d08bddb320ceb8d420508450b4f09a233cd5c22e6b9b" ] } """ # noqa def test_opam_loader_no_opam_repository_fails(swh_storage, tmpdir, datadir): """Running opam loader without a prepared opam repository fails""" opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "agrid" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=False, # The opam directory must be present ) # No opam root directory init directory from loader. So, at the opam root does not # exist, the loading fails. That's the expected use for the production workers # (whose opam_root maintenance will be externally managed). actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} def test_opam_loader_one_version(tmpdir, requests_mock_datadir, datadir, swh_storage): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "agrid" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=True, ) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("e1159446b00745ba4daa7ee26d74fbd81ecc081c") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot_id ) release_id = hash_to_bytes("d4d8d3df4f34609a3eeabd48aea49002c5f54f41") expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch(target=b"agrid.0.1", target_type=TargetType.ALIAS,), b"agrid.0.1": SnapshotBranch( target=release_id, target_type=TargetType.RELEASE, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert swh_storage.release_get([release_id])[0] == Release( name=b"0.1", message=b"Synthetic release for OPAM source package agrid version 0.1\n", target=hash_to_bytes("00412ee5bc601deb462e55addd1004715116785e"), target_type=ModelObjectType.DIRECTORY, synthetic=True, - author=Person( - fullname=b"OCamlPro ", name=None, email=None - ), + author=Person.from_fullname(b"OCamlPro "), date=None, id=release_id, ) stats = get_stats(swh_storage) assert { "content": 18, "directory": 8, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_opam_loader_many_version(tmpdir, requests_mock_datadir, datadir, swh_storage): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "directories" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=True, ) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("f498f7f3b0edbce5cf5834b487a4f8360f6a6a43") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target=b"directories.0.3", target_type=TargetType.ALIAS, ), b"directories.0.1": SnapshotBranch( target=hash_to_bytes("1c88d466b3d57a619e296999322d096fa37bb1c2"), target_type=TargetType.RELEASE, ), b"directories.0.2": SnapshotBranch( target=hash_to_bytes("d6f30684039ad485511a138e2ae504ff67a13075"), target_type=TargetType.RELEASE, ), b"directories.0.3": SnapshotBranch( target=hash_to_bytes("6cf92c0ff052074e69ac18809a9c8198bcc2e746"), target_type=TargetType.RELEASE, ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot_id ) check_snapshot(expected_snapshot, swh_storage) def test_opam_release(tmpdir, requests_mock_datadir, swh_storage, datadir): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "ocb" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=True, ) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("8ba39f050243a72ca667c5587a87413240cbaa47") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } info_iter = loader.get_package_info("0.1") branch_name, package_info = next(info_iter) expected_branch_name = "ocb.0.1" expected_package_info = OpamPackageInfo( url="https://github.com/OCamlPro/ocb/archive/0.1.tar.gz", filename=None, - author=Person( - fullname=b"OCamlPro ", name=None, email=None - ), - committer=Person( - fullname=b"OCamlPro ", name=None, email=None - ), + author=Person.from_fullname(b"OCamlPro "), + committer=Person.from_fullname(b"OCamlPro "), version="0.1", directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( metadata=OCB_METADATA, format="opam-package-definition", ) ], ) assert branch_name == expected_branch_name assert package_info == expected_package_info release_id = hash_to_bytes("c231e541eb29c712635ada394b04127ac69e9fb0") expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"HEAD": SnapshotBranch(target=b"ocb.0.1", target_type=TargetType.ALIAS,), b"ocb.0.1": SnapshotBranch( target=release_id, target_type=TargetType.RELEASE, ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) release = swh_storage.release_get([release_id])[0] assert release is not None assert release.author == expected_package_info.author def test_opam_metadata(tmpdir, requests_mock_datadir, swh_storage, datadir): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "ocb" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=True, ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" expected_release_id = hash_to_bytes("c231e541eb29c712635ada394b04127ac69e9fb0") expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"HEAD": SnapshotBranch(target=b"ocb.0.1", target_type=TargetType.ALIAS,), b"ocb.0.1": SnapshotBranch( target=expected_release_id, target_type=TargetType.RELEASE, ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) release = swh_storage.release_get([expected_release_id])[0] assert release is not None release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=expected_release_id ) directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url=opam_url, ) expected_metadata = [ RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.opam.loader.OpamLoader", version=__version__, ), discovery_date=loader.visit_date, format="opam-package-definition", metadata=OCB_METADATA, origin=url, release=release_swhid, ) ] assert swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py index 777ff50..0656eca 100644 --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -1,185 +1,185 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import functools import itertools import logging import os import re from typing import Callable, Dict, Optional, Tuple, TypeVar from urllib.parse import unquote, urlsplit from urllib.request import urlopen import requests from swh.loader.exception import NotFound from swh.loader.package import DEFAULT_PARAMS from swh.model.hashutil import HASH_BLOCK_SIZE, MultiHash from swh.model.model import Person logger = logging.getLogger(__name__) DOWNLOAD_HASHES = set(["sha1", "sha256", "length"]) -EMPTY_AUTHOR = Person(fullname=b"", name=None, email=None,) +EMPTY_AUTHOR = Person.from_fullname(b"") def api_info(url: str, **extra_params) -> bytes: """Basic api client to retrieve information on project. This deals with fetching json metadata about pypi projects. Args: url (str): The api url (e.g PyPI, npm, etc...) Raises: NotFound in case of query failures (for some reasons: 404, ...) Returns: The associated response's information """ response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params}) if response.status_code != 200: raise NotFound(f"Fail to query '{url}'. Reason: {response.status_code}") return response.content def _content_disposition_filename(header: str) -> Optional[str]: fname = None fnames = re.findall(r"filename[\*]?=([^;]+)", header) if fnames and "utf-8''" in fnames[0].lower(): # RFC 5987 fname = re.sub("utf-8''", "", fnames[0], flags=re.IGNORECASE) fname = unquote(fname) elif fnames: fname = fnames[0] if fname: fname = os.path.basename(fname.strip().strip('"')) return fname def download( url: str, dest: str, hashes: Dict = {}, filename: Optional[str] = None, auth: Optional[Tuple[str, str]] = None, extra_request_headers: Optional[Dict[str, str]] = None, ) -> Tuple[str, Dict]: """Download a remote tarball from url, uncompresses and computes swh hashes on it. Args: url: Artifact uri to fetch, uncompress and hash dest: Directory to write the archive to hashes: Dict of expected hashes (key is the hash algo) for the artifact to download (those hashes are expected to be hex string) auth: Optional tuple of login/password (for http authentication service, e.g. deposit) Raises: ValueError in case of any error when fetching/computing (length, checksums mismatched...) Returns: Tuple of local (filepath, hashes of filepath) """ params = copy.deepcopy(DEFAULT_PARAMS) if auth is not None: params["auth"] = auth if extra_request_headers is not None: params["headers"].update(extra_request_headers) # so the connection does not hang indefinitely (read/connection timeout) timeout = params.get("timeout", 60) if url.startswith("ftp://"): response = urlopen(url, timeout=timeout) chunks = (response.read(HASH_BLOCK_SIZE) for _ in itertools.count()) response_data = itertools.takewhile(bool, chunks) else: response = requests.get(url, **params, timeout=timeout, stream=True) if response.status_code != 200: raise ValueError( "Fail to query '%s'. Reason: %s" % (url, response.status_code) ) # update URL to response one as requests follow redirection by default # on GET requests url = response.url # try to extract filename from content-disposition header if available if filename is None and "content-disposition" in response.headers: filename = _content_disposition_filename( response.headers["content-disposition"] ) response_data = response.iter_content(chunk_size=HASH_BLOCK_SIZE) filename = filename if filename else os.path.basename(urlsplit(url).path) logger.debug("filename: %s", filename) filepath = os.path.join(dest, filename) logger.debug("filepath: %s", filepath) h = MultiHash(hash_names=DOWNLOAD_HASHES | set(hashes.keys())) with open(filepath, "wb") as f: for chunk in response_data: h.update(chunk) f.write(chunk) response.close() # Also check the expected hashes if provided if hashes: actual_hashes = h.hexdigest() for algo_hash in hashes.keys(): actual_digest = actual_hashes[algo_hash] expected_digest = hashes[algo_hash] if actual_digest != expected_digest: raise ValueError( "Failure when fetching %s. " "Checksum mismatched: %s != %s" % (url, expected_digest, actual_digest) ) computed_hashes = h.hexdigest() length = computed_hashes.pop("length") extrinsic_metadata = { "length": length, "filename": filename, "checksums": computed_hashes, "url": url, } logger.debug("extrinsic_metadata", extrinsic_metadata) return filepath, extrinsic_metadata def release_name(version: str, filename: Optional[str] = None) -> str: if filename: return "releases/%s/%s" % (version, filename) return "releases/%s" % version TReturn = TypeVar("TReturn") TSelf = TypeVar("TSelf") _UNDEFINED = object() def cached_method(f: Callable[[TSelf], TReturn]) -> Callable[[TSelf], TReturn]: cache_name = f"_cached_{f.__name__}" @functools.wraps(f) def newf(self): value = getattr(self, cache_name, _UNDEFINED) if value is _UNDEFINED: value = f(self) setattr(self, cache_name, value) return value return newf