diff --git a/MANIFEST.in b/MANIFEST.in index 8728027..d9875b2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,8 @@ include Makefile include requirements.txt include requirements-swh.txt include version.txt include README.md recursive-include swh/loader/package/tests/ *.tar.gz recursive-include swh py.typed +recursive-include swh/loader/package/tests/data/ * diff --git a/requirements-swh.txt b/requirements-swh.txt index 9fc5250..800086b 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,4 @@ +swh.core >= 0.0.75 swh.model >= 0.0.18 -swh.storage >= 0.0.133 +swh.storage >= 0.0.153 +swh.deposit diff --git a/requirements-test.txt b/requirements-test.txt index 5821059..a582a1e 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,2 +1,4 @@ pytest requests_mock +swh-core[testing] +swh-scheduler[testing] diff --git a/requirements.txt b/requirements.txt index 36f72d8..35eb9f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,10 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner retrying psutil requests +iso8601 +pkginfo +python-debian diff --git a/swh/loader/package/__init__.py b/swh/loader/package/__init__.py index e69de29..a8b4a14 100644 --- a/swh/loader/package/__init__.py +++ b/swh/loader/package/__init__.py @@ -0,0 +1,19 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +try: + from swh.loader.core._version import __version__ +except ImportError: + __version__ = 'devel' + + +DEFAULT_PARAMS = { + 'headers': { + 'User-Agent': 'Software Heritage Loader (%s)' % ( + __version__ + ) + } +} diff --git a/swh/loader/package/archive.py b/swh/loader/package/archive.py new file mode 100644 index 0000000..6c3557f --- /dev/null +++ b/swh/loader/package/archive.py @@ -0,0 +1,135 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import iso8601 +import logging + +from os import path +from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple + +from swh.loader.package.loader import PackageLoader +from swh.loader.package.utils import release_name +from swh.model.identifiers import normalize_timestamp + + +logger = logging.getLogger(__name__) +SWH_PERSON = { + 'name': b'Software Heritage', + 'fullname': b'Software Heritage', + 'email': b'robot@softwareheritage.org' +} +REVISION_MESSAGE = b'swh-loader-package: synthetic revision message' + + +class ArchiveLoader(PackageLoader): + visit_type = 'tar' + + def __init__(self, url: str, artifacts: Sequence[Mapping[str, Any]], + identity_artifact_keys: Optional[Sequence[str]] = None): + """Loader constructor. + + For now, this is the lister's task output. 
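+
+        For example, one artifact entry of a hypothetical GNU lister run
+        could look like this (values are illustrative, taken from the
+        8sync test data used elsewhere in this tree)::
+
+            {
+                'time': 944729610,
+                'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+                'filename': '8sync-0.1.0.tar.gz',
+                'version': '0.1.0',
+                'length': 221837,
+            }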
+
+        Args:
+            url: Origin url
+            artifacts: List of artifact information with keys:
+
+               **time**: last modification time as either isoformat date
+                 string or timestamp
+               **url**: the artifact url to retrieve
+               **filename**: artifact's filename
+               **version**: artifact's version
+               **length**: artifact's length
+
+            identity_artifact_keys: Optional List of keys forming the
+                "identity" of an artifact
+
+        """
+        super().__init__(url=url)
+        self.artifacts = artifacts  # assume order is enforced in the lister
+        if not identity_artifact_keys:
+            # default keys for gnu
+            identity_artifact_keys = ['time', 'url', 'length', 'version']
+        self.identity_artifact_keys = identity_artifact_keys
+
+    def get_versions(self) -> Sequence[str]:
+        versions = []
+        for archive in self.artifacts:
+            v = archive.get('version')
+            if v:
+                versions.append(v)
+        return versions
+
+    def get_default_version(self) -> str:
+        # It's the most recent, so for this loader, it's the last one
+        return self.artifacts[-1]['version']
+
+    def get_package_info(self, version: str) -> Generator[
+            Tuple[str, Mapping[str, Any]], None, None]:
+        for a_metadata in self.artifacts:
+            url = a_metadata['url']
+            package_version = a_metadata['version']
+            if version == package_version:
+                filename = a_metadata.get('filename')
+                p_info = {
+                    'url': url,
+                    'filename': filename if filename else path.split(url)[-1],
+                    'raw': a_metadata,
+                }
+                # FIXME: this code assumes we have only 1 artifact per
+                # versioned package
+                yield release_name(version), p_info
+
+    def resolve_revision_from(
+            self, known_artifacts: Dict, artifact_metadata: Dict) \
+            -> Optional[bytes]:
+        identity = artifact_identity(
+            artifact_metadata, id_keys=self.identity_artifact_keys)
+        for rev_id, known_artifact in known_artifacts.items():
+            logger.debug('known_artifact: %s', known_artifact)
+            reference_artifact = known_artifact['extrinsic']['raw']
+            known_identity = artifact_identity(
+                reference_artifact, id_keys=self.identity_artifact_keys)
+            if identity == known_identity:
+                return rev_id
+
+    def build_revision(self, a_metadata: Mapping[str, Any],
+                       uncompressed_path: str) -> Dict:
+        time = a_metadata['time']  # assume it's a timestamp
+        if isinstance(time, str):  # otherwise, assume it's a parsable date
+            time = iso8601.parse_date(time)
+        normalized_time = normalize_timestamp(time)
+        return {
+            'type': 'tar',
+            'message': REVISION_MESSAGE,
+            'date': normalized_time,
+            'author': SWH_PERSON,
+            'committer': SWH_PERSON,
+            'committer_date': normalized_time,
+            'parents': [],
+            'metadata': {
+                'intrinsic': {},
+                'extrinsic': {
+                    'provider': self.url,
+                    'when': self.visit_date.isoformat(),
+                    'raw': a_metadata,
+                },
+            },
+        }
+
+
+def artifact_identity(d: Mapping[str, Any],
+                      id_keys: Sequence[str]) -> Sequence[Any]:
+    """Compute the primary key for a dict using the id_keys as primary key
+    composite.
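+
+    A quick sketch of the computation (values are illustrative)::
+
+        >>> artifact_identity(
+        ...     {'time': 42, 'url': 'file.tgz', 'length': 3},
+        ...     id_keys=['time', 'url'])
+        [42, 'file.tgz']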
+
+    Args:
+        d: A dict entry to compute the primary key on
+        id_keys: Sequence of keys to use as primary key
+
+    Returns:
+        The identity for that dict entry
+
+    """
+    return [d.get(k) for k in id_keys]
diff --git a/swh/loader/package/debian.py b/swh/loader/package/debian.py
new file mode 100644
index 0000000..cd21630
--- /dev/null
+++ b/swh/loader/package/debian.py
@@ -0,0 +1,395 @@
+# Copyright (C) 2017-2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import email.utils
+import iso8601
+import logging
+import re
+import subprocess
+
+from dateutil.parser import parse as parse_date
+from debian.changelog import Changelog
+from debian.deb822 import Dsc
+from os import path
+from typing import (
+    Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
+)
+
+from swh.loader.package.loader import PackageLoader
+from swh.loader.package.utils import download, release_name
+
+
+logger = logging.getLogger(__name__)
+UPLOADERS_SPLIT = re.compile(r'(?<=\>)\s*,\s*')
+
+
+class DebianLoader(PackageLoader):
+    """Load debian origins into swh archive.
+
+    """
+    visit_type = 'debian'
+
+    def __init__(self, url: str, date: str, packages: Mapping[str, Any]):
+        """Debian Loader implementation.
+
+        Args:
+            url: Origin url (e.g. deb://Debian/packages/cicero)
+            date: Ignored
+            packages: versioned packages and associated artifacts, example::
+
+              {
+                'stretch/contrib/0.7.2-3': {
+                  'name': 'cicero',
+                  'version': '0.7.2-3',
+                  'files': {
+                    'cicero_0.7.2-3.diff.gz': {
+                       'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce',
+                       'name': 'cicero_0.7.2-3.diff.gz',
+                       'sha256': 'f039c9642fe15c75bed5254315e2a29f...',
+                       'size': 3964,
+                       'uri': 'http://d.d.o/cicero_0.7.2-3.diff.gz',
+                    },
+                    'cicero_0.7.2-3.dsc': {
+                       'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a',
+                       'name': 'cicero_0.7.2-3.dsc',
+                       'sha256': '35b7f1048010c67adfd8d70e4961aefb...',
+                       'size': 1864,
+                       'uri': 'http://d.d.o/cicero_0.7.2-3.dsc',
+                    },
+                    'cicero_0.7.2.orig.tar.gz': {
+                       'md5sum': '4353dede07c5728319ba7f5595a7230a',
+                       'name': 'cicero_0.7.2.orig.tar.gz',
+                       'sha256': '63f40f2436ea9f67b44e2d4bd669dbab...',
+                       'size': 96527,
+                       'uri': 'http://d.d.o/cicero_0.7.2.orig.tar.gz',
+                    }
+                  },
+                },
+                # ...
+              }
+
+        """
+        super().__init__(url=url)
+        self.packages = packages
+
+    def get_versions(self) -> Sequence[str]:
+        """Returns the keys of the packages input (e.g.
+        stretch/contrib/0.7.2-3, etc...)
+
+        """
+        return list(self.packages.keys())
+
+    def get_default_version(self) -> str:
+        """No default version for debian so no HEAD alias in snapshot.
+ + """ + return None + + def get_package_info(self, version: str) -> Generator[ + Tuple[str, Mapping[str, Any]], None, None]: + meta = self.packages[version] + p_info = meta.copy() + p_info['raw'] = meta + yield release_name(version), p_info + + def resolve_revision_from( + self, known_package_artifacts: Dict, artifact_metadata: Dict) \ + -> Optional[bytes]: + artifacts_to_fetch = artifact_metadata['files'] + logger.debug('k_p_artifacts: %s', known_package_artifacts) + logger.debug('artifacts_to_fetch: %s', artifacts_to_fetch) + for rev_id, known_artifacts in known_package_artifacts.items(): + logger.debug('Revision: %s', rev_id) + logger.debug('Associated known_artifacts: %s', known_artifacts) + known_artifacts = known_artifacts['extrinsic']['raw']['files'] + rev_found = True + for a_name, k_artifact in known_artifacts.items(): + artifact_to_fetch = artifacts_to_fetch.get(a_name) + logger.debug('artifact_to_fetch: %s', artifact_to_fetch) + if artifact_to_fetch is None: + # as soon as we do not see an artifact, we consider we need + # to check the other revision + rev_found = False + if k_artifact['sha256'] != artifact_to_fetch['sha256']: + # Hash is different, we consider we need to check the other + # revisions + rev_found = False + if rev_found: + logger.debug('Existing revision %s found for new artifacts.', + rev_id) + return rev_id + # if we pass here, we did not find any known artifacts + logger.debug('No existing revision found for the new artifacts.') + + def download_package(self, p_info: Mapping[str, Any], + tmpdir: str) -> [Tuple[str, Dict]]: + """Contrary to other package loaders (1 package, 1 artifact), + `a_metadata` represents the package's datafiles set to fetch: + - .orig.tar.gz + - .dsc + - .diff.gz + + This is delegated to the `download_package` function. + + """ + all_hashes = download_package(p_info, tmpdir) + logger.debug('all_hashes: %s', all_hashes) + res = [] + for hashes in all_hashes.values(): + res.append((tmpdir, hashes)) + logger.debug('res: %s', res) + return res + + def uncompress(self, dl_artifacts: [Tuple[str, Dict]], dest: str) -> str: + logger.debug('dl_artifacts: %s', dl_artifacts) + return extract_package(dl_artifacts, dest=dest) + + def build_revision(self, a_metadata: Mapping[str, Any], + uncompressed_path: str) -> Dict: + dsc_url, dsc_name = dsc_information(a_metadata) + dsc_path = path.join(path.dirname(uncompressed_path), dsc_name) + i_metadata = get_package_metadata( + a_metadata, dsc_path, uncompressed_path) + + logger.debug('i_metadata: %s', i_metadata) + logger.debug('a_metadata: %s', a_metadata) + + msg = 'Synthetic revision for Debian source package %s version %s' % ( + a_metadata['name'], a_metadata['version']) + + date = iso8601.parse_date(i_metadata['changelog']['date']) + author = prepare_person(i_metadata['changelog']['person']) + + # inspired from swh.loader.debian.converters.package_metadata_to_revision # noqa + return { + 'type': 'dsc', + 'message': msg.encode('utf-8'), + 'author': author, + 'date': date, + 'committer': author, + 'committer_date': date, + 'parents': [], + 'metadata': { + 'intrinsic': { + 'tool': 'dsc', + 'raw': i_metadata, + }, + 'extrinsic': { + 'provider': dsc_url, + 'when': self.visit_date.isoformat(), + 'raw': a_metadata, + }, + } + } + + +def uid_to_person(uid: str) -> Mapping[str, str]: + """Convert an uid to a person suitable for insertion. 
+
+    Args:
+        uid: an uid of the form "Name <email@address>"
+
+    Returns:
+        a dictionary with the following keys:
+
+        - name: the name associated to the uid
+        - email: the mail associated to the uid
+        - fullname: the actual uid input
+
+    """
+    logger.debug('uid: %s', uid)
+    ret = {
+        'name': '',
+        'email': '',
+        'fullname': uid,
+    }
+
+    name, mail = email.utils.parseaddr(uid)
+    if name and mail:
+        ret['name'] = name
+        ret['email'] = mail
+    else:
+        ret['name'] = uid
+    return ret
+
+
+def prepare_person(person: Mapping[str, str]) -> Mapping[str, bytes]:
+    """Prepare person for swh serialization...
+
+    Args:
+        person: A person dict
+
+    Returns:
+        A person dict ready for storage
+
+    """
+    ret = {}
+    for key, value in person.items():
+        ret[key] = value.encode('utf-8')
+    return ret
+
+
+def download_package(
+        package: Mapping[str, Any], tmpdir: Any) -> Mapping[str, Any]:
+    """Fetch a source package in a temporary directory and check the checksums
+    for all files.
+
+    Args:
+        package: Dict defining the set of files representing a debian package
+        tmpdir: Where to download and extract the files to ingest
+
+    Returns:
+        Dict of swh hashes per filename key
+
+    """
+    all_hashes = {}
+    for filename, fileinfo in package['files'].items():
+        uri = fileinfo['uri']
+        logger.debug('fileinfo: %s', fileinfo)
+        extrinsic_hashes = {'sha256': fileinfo['sha256']}
+        logger.debug('extrinsic_hashes(%s): %s', filename, extrinsic_hashes)
+        filepath, hashes = download(uri, dest=tmpdir, filename=filename,
                                    hashes=extrinsic_hashes)
+        all_hashes[filename] = hashes
+
+    logger.debug('all_hashes: %s', all_hashes)
+    return all_hashes
+
+
+def dsc_information(package: Mapping[str, Any]) -> Tuple[str, str]:
+    """Retrieve dsc information from a package.
+
+    Args:
+        package: Package metadata information
+
+    Returns:
+        Tuple of dsc file's uri, dsc's full disk path
+
+    """
+    dsc_name = None
+    dsc_url = None
+    for filename, fileinfo in package['files'].items():
+        if filename.endswith('.dsc'):
+            if dsc_name:
+                raise ValueError(
+                    'Package %s_%s references several dsc files' %
+                    (package['name'], package['version'])
+                )
+            dsc_url = fileinfo['uri']
+            dsc_name = filename
+
+    return dsc_url, dsc_name
+
+
+def extract_package(dl_artifacts: List[Tuple[str, Dict]], dest: str) -> str:
+    """Extract a Debian source package to a given directory.
+
+    Note that after extraction the target directory will be the root of the
+    extracted package, rather than containing it.
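+
+    Under the hood this boils down to a dpkg-source invocation of the
+    following shape (paths are illustrative)::
+
+        dpkg-source --no-copy --no-check --ignore-bad-version \
+            -x /tmpdir/cicero_0.7.2-3.dsc /dest/extracted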
+
+    Args:
+        dl_artifacts: list of (path, hashes) tuples describing the downloaded
+            package artifacts
+        dest: directory where the package files are stored
+
+    Returns:
+        Package extraction directory
+
+    """
+    a_path = dl_artifacts[0][0]
+    logger.debug('dl_artifacts: %s', dl_artifacts)
+    for _, hashes in dl_artifacts:
+        logger.debug('hashes: %s', hashes)
+        filename = hashes['filename']
+        if filename.endswith('.dsc'):
+            dsc_name = filename
+            break
+
+    dsc_path = path.join(a_path, dsc_name)
+    destdir = path.join(dest, 'extracted')
+    logfile = path.join(dest, 'extract.log')
+    logger.debug('extract Debian source package %s in %s' %
+                 (dsc_path, destdir), extra={
+                     'swh_type': 'deb_extract',
+                     'swh_dsc': dsc_path,
+                     'swh_destdir': destdir,
+                 })
+
+    cmd = ['dpkg-source',
+           '--no-copy', '--no-check',
+           '--ignore-bad-version',
+           '-x', dsc_path,
+           destdir]
+
+    try:
+        with open(logfile, 'w') as stdout:
+            subprocess.check_call(cmd, stdout=stdout, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+        logdata = open(logfile, 'r').read()
+        raise ValueError('dpkg-source exited with code %s: %s' %
+                         (e.returncode, logdata)) from None
+
+    return destdir
+
+
+def get_package_metadata(package: Mapping[str, Any], dsc_path: str,
+                         extracted_path: str) -> Mapping[str, Any]:
+    """Get the package metadata from the source package at dsc_path,
+    extracted in extracted_path.
+
+    Args:
+        package: the package dict (with a dsc_path key)
+        dsc_path: path to the package's dsc file
+        extracted_path: the path where the package got extracted
+
+    Returns:
+        dict: a dictionary with the following keys:
+
+        - history: list of (package_name, package_version) tuples parsed from
+          the package changelog
+
+    """
+    with open(dsc_path, 'rb') as dsc:
+        parsed_dsc = Dsc(dsc)
+
+    # Parse the changelog to retrieve the rest of the package information
+    changelog_path = path.join(extracted_path, 'debian/changelog')
+    with open(changelog_path, 'rb') as changelog:
+        try:
+            parsed_changelog = Changelog(changelog)
+        except UnicodeDecodeError:
+            logger.warning('Unknown encoding for changelog %s,'
+                           ' falling back to iso', changelog_path,
+                           extra={
+                               'swh_type': 'deb_changelog_encoding',
+                               'swh_name': package['name'],
+                               'swh_version': str(package['version']),
+                               'swh_changelog': changelog_path,
+                           })
+
+            # need to reset as Changelog scrolls to the end of the file
+            changelog.seek(0)
+            parsed_changelog = Changelog(changelog, encoding='iso-8859-15')
+
+    package_info = {
+        'name': package['name'],
+        'version': str(package['version']),
+        'changelog': {
+            'person': uid_to_person(parsed_changelog.author),
+            'date': parse_date(parsed_changelog.date).isoformat(),
+            'history': [(block.package, str(block.version))
+                        for block in parsed_changelog][1:],
+        }
+    }
+
+    maintainers = [
+        uid_to_person(parsed_dsc['Maintainer']),
+    ]
+    maintainers.extend(
+        uid_to_person(person)
+        for person in UPLOADERS_SPLIT.split(parsed_dsc.get('Uploaders', ''))
+    )
+    package_info['maintainers'] = maintainers
+
+    return package_info
diff --git a/swh/loader/package/deposit.py b/swh/loader/package/deposit.py
new file mode 100644
index 0000000..160819c
--- /dev/null
+++ b/swh/loader/package/deposit.py
@@ -0,0 +1,154 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+
+from typing import Dict, Generator, Mapping, Sequence, Tuple
+
+from swh.model.hashutil import hash_to_hex
+from swh.loader.package.loader import PackageLoader
+from swh.deposit.client import PrivateApiDepositClient as ApiClient
+
+
+logger = logging.getLogger(__name__)
+
+
+class DepositLoader(PackageLoader):
+    """Load a deposited artifact into the swh archive.
+
+    """
+    visit_type = 'deposit'
+
+    def __init__(self, url: str, deposit_id: str):
+        """Constructor
+
+        Args:
+            url: Origin url to associate the artifacts/metadata to
+            deposit_id: Deposit identity
+
+        """
+        super().__init__(url=url)
+
+        # For now build back existing api urls
+        # archive_url: Private api url to retrieve archive artifact
+        self.archive_url = '/%s/raw/' % deposit_id
+        # metadata_url: Private api url to retrieve the deposit metadata
+        self.metadata_url = '/%s/meta/' % deposit_id
+        # deposit_update_url: Private api to push pids and status update on the
+        # deposit id
+        self.deposit_update_url = '/%s/update/' % deposit_id
+        self.client = ApiClient()
+        self._metadata = None
+
+    @property
+    def metadata(self):
+        if self._metadata is None:
+            self._metadata = self.client.metadata_get(self.metadata_url)
+        return self._metadata
+
+    def get_versions(self) -> Sequence[str]:
+        # only 1 branch 'HEAD' with no alias since we only have 1 snapshot
+        # branch
+        return ['HEAD']
+
+    def get_package_info(self, version: str) -> Generator[
+            Tuple[Mapping[str, str], Dict], None, None]:
+        p_info = {
+            'url': self.client.base_url + self.archive_url,
+            'filename': 'archive.zip',
+            'raw': self.metadata,
+        }
+        yield 'HEAD', p_info
+
+    def build_revision(
+            self, a_metadata: Dict, uncompressed_path: str) -> Dict:
+        revision = a_metadata.pop('revision')
+        metadata = {
+            'extrinsic': {
+                'provider': '%s/%s' % (
+                    self.client.base_url, self.metadata_url),
+                'when': self.visit_date.isoformat(),
+                'raw': a_metadata,
+            },
+        }
+
+        # FIXME: the deposit no longer needs to build the revision
+        revision['metadata'].update(metadata)
+        revision['author'] = parse_author(revision['author'])
+        revision['committer'] = parse_author(revision['committer'])
+        revision['message'] = revision['message'].encode('utf-8')
+        revision['type'] = 'tar'
+
+        return revision
+
+    def load(self) -> Dict:
+        # Usual loading
+        r = super().load()
+        success = r['status'] != 'failed'
+
+        if success:
+            # Update archive with metadata information
+            origin_metadata = self.metadata['origin_metadata']
+
+            logger.debug('origin_metadata: %s', origin_metadata)
+            tools = self.storage.tool_add([origin_metadata['tool']])
+            logger.debug('tools: %s', tools)
+            tool_id = tools[0]['id']
+
+            provider = origin_metadata['provider']
+            # FIXME: Shall we delete this info?
+ provider_id = self.storage.metadata_provider_add( + provider['provider_name'], + provider['provider_type'], + provider['provider_url'], + metadata=None) + + metadata = origin_metadata['metadata'] + self.storage.origin_metadata_add( + self.url, self.visit_date, provider_id, tool_id, metadata) + + # Update deposit status + try: + if not success: + self.client.status_update( + self.deposit_update_url, status='failed') + return r + + snapshot_id = r['snapshot_id'] + branches = self.storage.snapshot_get(snapshot_id)['branches'] + logger.debug('branches: %s', branches) + if not branches: + return r + rev_id = branches[b'HEAD']['target'] + + revision = next(self.storage.revision_get([rev_id])) + + # Retrieve the revision identifier + dir_id = revision['directory'] + + # update the deposit's status to success with its + # revision-id and directory-id + self.client.status_update( + self.deposit_update_url, + status='done', + revision_id=hash_to_hex(rev_id), + directory_id=hash_to_hex(dir_id), + origin_url=self.url) + except Exception: + logger.exception( + 'Problem when trying to update the deposit\'s status') + return {'status': 'failed'} + return r + + +def parse_author(author): + """See prior fixme + + """ + return { + 'fullname': author['fullname'].encode('utf-8'), + 'name': author['name'].encode('utf-8'), + 'email': author['email'].encode('utf-8'), + } diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py index 4933533..6e75cb3 100644 --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -1,477 +1,389 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime +import logging +import tempfile import os -import shutil -import requests -try: - from _version import __version__ # type: ignore -except ImportError: - __version__ = 'devel' - - -from tempfile import mkdtemp +from typing import ( + Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple +) -from swh.core import tarball -from swh.loader.core.utils import clean_dangling_folders -from swh.loader.core.loader import BufferedLoader -from swh.model.identifiers import normalize_timestamp -from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE +from swh.core.tarball import uncompress +from swh.core.config import SWHConfig from swh.model.from_disk import Directory - from swh.model.identifiers import ( - identifier_to_bytes, revision_identifier, snapshot_identifier + revision_identifier, snapshot_identifier, identifier_to_bytes ) +from swh.storage import get_storage +from swh.storage.algos.snapshot import snapshot_get_all_branches +from swh.loader.core.converters import content_for_storage +from swh.loader.package.utils import download -DEBUG_MODE = '** DEBUG MODE **' +logger = logging.getLogger(__name__) -class GNULoader(BufferedLoader): - SWH_PERSON = { - 'name': b'Software Heritage', - 'fullname': b'Software Heritage', - 'email': b'robot@softwareheritage.org' - } - REVISION_MESSAGE = b'swh-loader-package: synthetic revision message' +# Not implemented yet: +# - clean up disk routines from previous killed workers (when OOMkilled) +# -> separation of concern would like this to be abstracted from the code +# -> experience tells us it's complicated to do as such (T903, T964, T982, +# etc...) 
+# +# - model: swh.model.merkle.from_disk should output swh.model.model.* objects +# to avoid this layer's conversion routine call +# -> Take this up within swh.model's current implementation - visit_type = 'gnu' - def __init__(self): - self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.gnu.' - super().__init__(logging_class='swh.loader.package.GNULoader') +class PackageLoader: + # Origin visit type (str) set by the loader + visit_type = '' - self.dir_path = None - temp_directory = self.config['temp_directory'] - os.makedirs(temp_directory, exist_ok=True) - - self.temp_directory = mkdtemp( - suffix='-%s' % os.getpid(), - prefix=self.TEMPORARY_DIR_PREFIX_PATTERN, - dir=temp_directory) - - self.debug = self.config.get('debug', False) - self.session = requests.session() - self.params = { - 'headers': { - 'User-Agent': 'Software Heritage Loader (%s)' % ( - __version__ - ) - } - } - - def pre_cleanup(self): - """To prevent disk explosion if some other workers exploded - in mid-air (OOM killed), we try and clean up dangling files. - - """ - if self.debug: - self.log.warning('%s Will not pre-clean up temp dir %s' % ( - DEBUG_MODE, self.temp_directory - )) - return - clean_dangling_folders(self.temp_directory, - pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN, - log=self.log) - - def prepare_origin_visit(self, name, origin_url, **kwargs): - """Prepare package visit. + def __init__(self, url): + """Loader's constructor. This raises exception if the minimal required + configuration is missing (cf. fn:`check` method). Args: - name (str): Package Name - origin_url (str): Package origin url - **kwargs: Arbitrary keyword arguments passed by the lister. + url (str): Origin url to load data from """ - # reset statuses - self._load_status = 'uneventful' - self._visit_status = 'full' - self.done = False - - self.origin = { - 'url': origin_url, - 'type': self.visit_type, - } + # This expects to use the environment variable SWH_CONFIG_FILENAME + self.config = SWHConfig.parse_config_file() + self._check_configuration() + self.storage = get_storage(**self.config['storage']) + self.url = url + self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) - self.visit_date = None # loader core will populate it + def _check_configuration(self): + """Checks the minimal configuration required is set for the loader. - def prepare(self, name, origin_url, **kwargs): - """Prepare effective loading of source tarballs for a package manager - package. - - Args: - name (str): Package Name - origin_url (str): Package origin url - **kwargs: Arbitrary keyword arguments passed by the lister. + If some required configuration is missing, exception detailing the + issue is raised. """ - self.package_contents = [] - self.package_directories = [] - self.package_revisions = [] - self.all_version_data = [] - self.latest_timestamp = 0 - # Conceled the data into one dictionary to eliminate the need of - # passing all the parameters when required in some method - self.package_details = { - 'name': name, - 'origin_url': origin_url, - 'tarballs': kwargs['tarballs'], - } + if 'storage' not in self.config: + raise ValueError( + 'Misconfiguration, at least the storage key should be set') - self.package_temp_dir = os.path.join(self.temp_directory, - self.package_details['name']) + def get_versions(self) -> Sequence[str]: + """Return the list of all published package versions. 
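+
+        For example, the PyPI implementation below returns the keys of the
+        'releases' mapping fetched from the registry, and the archive loader
+        collects the 'version' field of each of its artifacts.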
- self.new_versions = \ - self.prepare_package_versions(self.package_details['tarballs']) + Returns: + Sequence of published versions - def prepare_package_versions(self, tarballs): """ - Instantiate a generator that will process a specific package release - version at each iteration step. The following operations will be - performed: - - 1. Create a temporary directory to download and extract the - release tarball. - 2. Download the tarball. - 3. Uncompress the tarball. - 4. Parse the file associated to the package version to extract - metadata (optional). - 5. Delete unnecessary files (optional). + return [] - Args: - tarballs (list): a list of dicts containing information about the - respective tarball that is provided by lister. - known_versions (dict): may be provided by the loader, it enables - to filter out versions already ingested in the archive. + def get_package_info(self, version: str) -> Generator[ + Tuple[str, Mapping[str, Any]], None, None]: + """Given a release version of a package, retrieve the associated + package information for such version. - Yields: - Tuple[dict, str]: tuples containing the following - members: + Args: + version: Package version - * a dict holding package tarball information and metadata - * a string holding the path of the uncompressed package to - load into the archive + Returns: + (branch name, package metadata) """ - for package_version_data in tarballs: - - tarball_url = package_version_data['archive'] - tarball_request = self._request(tarball_url, - throw_error=False) - if tarball_request.status_code == 404: - self.log.warning('Tarball url %s returns a 404 error.', - tarball_url) - self._visit_status = 'partial' - # FIX ME: Do we need to mark it `partial` here - continue + yield from {} - yield self._prepare_package_version(package_version_data, - tarball_request) + def build_revision( + self, a_metadata: Dict, i_metadata: Dict) -> Dict: + """Build the revision dict from the archive metadata (extrinsic + artifact metadata) and the intrinsic metadata. - def _request(self, url, throw_error=True): - """Request the remote tarball url. + Returns: + SWH data dict - Args: - url (str): Url (file or http*). + """ + return {} - Raises: - ValueError in case of failing to query. + def get_default_version(self) -> str: + """Retrieve the latest release version Returns: - Tuple of local (filepath, hashes of filepath). + Latest version """ - response = self.session.get(url, **self.params, stream=True) - if response.status_code != 200 and throw_error: - raise ValueError("Fail to query '%s'. Reason: %s" % ( - url, response.status_code)) - - return response + return '' - def _prepare_package_version(self, package_version_data, tarball_request): - """Process the package release version. + def last_snapshot(self) -> Optional[Dict]: + """Retrieve the last snapshot - The following operations are performed: + """ + visit = self.storage.origin_visit_get_latest( + self.url, require_snapshot=True) + if visit: + return snapshot_get_all_branches( + self.storage, visit['snapshot']) - 1. Download the tarball - 2. Uncompress the tarball - 3. Delete unnecessary files (optional) - 4. Parse the file associated to the package version to extract - metadata (optional) + def known_artifacts(self, snapshot: Dict) -> [Dict]: + """Retrieve the known releases/artifact for the origin. - Args: - package_version_data (dict): containing information - about the focused package version. 
- known_versions (dict): may be provided by the loader, it enables - to filter out versions already ingested in the archive. + Args + snapshot: snapshot for the visit - Return: - Tuple[dict, str]: tuples containing the following - members: - - * a dict holding package tarball information and metadata - * a string holding the path of the uncompressed package to - load into the archive + Returns: + Dict of keys revision id (bytes), values a metadata Dict. """ - url = package_version_data['archive'] - tarball_path, hashes = self.download_generate_hash(tarball_request, - url) - uncompressed_path = os.path.join(self.package_temp_dir, 'uncompressed', - os.path.basename(url)) # SEE ME - self.uncompress_tarball(tarball_path, uncompressed_path) - - # remove tarball - os.remove(tarball_path) + if not snapshot or 'branches' not in snapshot: + return {} + + # retrieve only revisions (e.g the alias we do not want here) + revs = [rev['target'] + for rev in snapshot['branches'].values() + if rev and rev['target_type'] == 'revision'] + known_revisions = self.storage.revision_get(revs) + + ret = {} + for revision in known_revisions: + if not revision: # revision_get can return None + continue + ret[revision['id']] = revision['metadata'] - if self.tarball_invalid: - return None, None + return ret - return package_version_data, uncompressed_path + def resolve_revision_from( + self, known_artifacts: Dict, artifact_metadata: Dict) \ + -> Optional[bytes]: + """Resolve the revision from a snapshot and an artifact metadata dict. - def download_generate_hash(self, response, url): - """Store file in temp directory and computes hash of its filepath. + If the artifact has already been downloaded, this will return the + existing revision targeting that uncompressed artifact directory. + Otherwise, this returns None. Args: - response (Response): Server response of the url - url (str): Url of the tarball + snapshot: Snapshot + artifact_metadata: Information dict Returns: - Tuple of local (filepath, hashes of filepath) + None or revision identifier """ - length = int(response.headers['content-length']) - os.makedirs(self.package_temp_dir, exist_ok=True) - # SEE ME - filepath = os.path.join(self.package_temp_dir, os.path.basename(url)) - - # Convert the server response to a file. - h = MultiHash(length=length) - with open(filepath, 'wb') as f: - for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE): - h.update(chunk) - f.write(chunk) - - # Check for the validity of the tarball downloaded. - actual_length = os.path.getsize(filepath) - if length != actual_length: - raise ValueError('Error when checking size: %s != %s' % ( - length, actual_length)) - - hashes = { - 'length': length, - **h.hexdigest() - } - return filepath, hashes + return None - def uncompress_tarball(self, filepath, path): - """Uncompress a tarball. - - Args: - filepath (str): Path of tarball to uncompress - path (str): The destination folder where to uncompress the tarball - Returns: - The nature of the tarball, zip or tar. + def download_package(self, p_info: Mapping[str, Any], + tmpdir: str) -> [Tuple[str, Dict]]: + """Download artifacts for a specific package. All downloads happen in + in the tmpdir folder. - """ - try: - self.tarball_invalid = False - tarball.uncompress(filepath, path) - except Exception: - self.tarball_invalid = True - self._visit_status = 'partial' + Default implementation expects the artifacts package info to be + about one artifact per package. 
- def fetch_data(self): - """Called once per release artifact version (can be many for one - release). + Note that most implementation have 1 artifact per package. But some + implementation have multiple artifacts per package (debian), some have + none, the package is the artifact (gnu). - This will for each call: - - retrieve a release artifact (associated to a release version) - - Computes the swh objects + Args: + artifacts_package_info: Information on the package artifacts to + download (url, filename, etc...) + tmpdir: Location to retrieve such artifacts Returns: - True as long as data to fetch exist + List of (path, computed hashes) """ - data = None - if self.done: - return False + a_uri = p_info['url'] + filename = p_info.get('filename') + return [download(a_uri, dest=tmpdir, filename=filename)] - try: - data = next(self.new_versions) - self._load_status = 'eventful' - except StopIteration: - self.done = True - return False - - package_version_data, dir_path = data - - # package release tarball was corrupted - if self.tarball_invalid: - return not self.done - - dir_path = dir_path.encode('utf-8') - directory = Directory.from_disk(path=dir_path, data=True) - objects = directory.collect() - - if 'content' not in objects: - objects['content'] = {} - if 'directory' not in objects: - objects['directory'] = {} - - self.package_contents = objects['content'].values() - self.package_directories = objects['directory'].values() - - revision = self.build_revision(directory, - package_version_data) - - revision['id'] = identifier_to_bytes( - revision_identifier(revision)) - self.package_revisions.append(revision) - self.log.debug(revision) - package_version_data['id'] = revision['id'] - self.all_version_data.append(package_version_data) - - # To find the latest version - if self.latest_timestamp < int(package_version_data['date']): - self.latest_timestamp = int(package_version_data['date']) - - self.log.debug('Removing unpacked package files at %s', dir_path) - shutil.rmtree(dir_path) - - return not self.done - - def build_revision(self, directory, package_version_data): - normalize_date = normalize_timestamp(int(package_version_data['date'])) - return { - 'metadata': { - 'package': { - 'date': package_version_data['date'], - 'archive': package_version_data['archive'], - }, - }, - 'date': normalize_date, - 'committer_date': normalize_date, - 'author': self.SWH_PERSON, - 'committer': self.SWH_PERSON, - 'type': 'tar', - 'message': self.REVISION_MESSAGE, - 'directory': directory.hash, - 'synthetic': True, - 'parents': [], - } + def uncompress(self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], + dest: str) -> str: + """Uncompress the artifact(s) in the destination folder dest. - def store_data(self): - """Store fetched data in the database. + Optionally, this could need to use the p_info dict for some more + information (debian). """ - self.maybe_load_contents(self.package_contents) - self.maybe_load_directories(self.package_directories) - self.maybe_load_revisions(self.package_revisions) + uncompressed_path = os.path.join(dest, 'src') + for a_path, _ in dl_artifacts: + uncompress(a_path, dest=uncompressed_path) + return uncompressed_path - if self.done: - self.generate_and_load_snapshot() - self.flush() + def load(self) -> Dict: + """Load for a specific origin the associated contents. - def generate_and_load_snapshot(self): - """Generate and load snapshot for the package visit. 
+ for each package version of the origin - """ - branches = {} - for version_data in self.all_version_data: - branch_name = self.find_branch_name(version_data['archive']) + 1. Fetch the files for one package version By default, this can be + implemented as a simple HTTP request. Loaders with more specific + requirements can override this, e.g.: the PyPI loader checks the + integrity of the downloaded files; the Debian loader has to download + and check several files for one package version. - target = self.target_from_version(version_data['id']) - branches[branch_name] = target - branches = self.find_head(branches, branch_name, - version_data['date']) + 2. Extract the downloaded files By default, this would be a universal + archive/tarball extraction. - if not target: - self._visit_status = 'partial' + Loaders for specific formats can override this method (for instance, + the Debian loader uses dpkg-source -x). - snapshot = { - 'branches': branches, - } + 3. Convert the extracted directory to a set of Software Heritage + objects Using swh.model.from_disk. - snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot)) - self.maybe_load_snapshot(snapshot) + 4. Extract the metadata from the unpacked directories This would only + be applicable for "smart" loaders like npm (parsing the + package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing + debian/changelog and debian/control). - def find_branch_name(self, url): - """Extract branch name from tarball url + On "minimal-metadata" sources such as the GNU archive, the lister + should provide the minimal set of metadata needed to populate the + revision/release objects (authors, dates) as an argument to the + task. - Args: - url (str): Tarball URL + 5. Generate the revision/release objects for the given version. From + the data generated at steps 3 and 4. - Returns: - byte: Branch name + end for each - Example: - For url = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz + 6. Generate and load the snapshot for the visit - >>> find_branch_name(url) - b'release/8sync-0.2.0' + Using the revisions/releases collected at step 5., and the branch + information from step 0., generate a snapshot and load it into the + Software Heritage archive """ - branch_name = '' - filename = os.path.basename(url) - filename_parts = filename.split(".") - if len(filename_parts) > 1 and filename_parts[-2] == 'tar': - for part in filename_parts[:-2]: - branch_name += '.' + part - elif len(filename_parts) > 1 and filename_parts[-1] == 'zip': - for part in filename_parts[:-1]: - branch_name += '.' + part + status_load = 'uneventful' # either: eventful, uneventful, failed + status_visit = 'full' # either: partial, full + tmp_revisions = {} # type: Dict[str, List] + snapshot = None - return (('release/%s') % branch_name[1:]).encode('ascii') - - def find_head(self, branches, branch_name, timestamp): - """Make branch head. - - Checks if the current version is the latest version. Make it as head - if it is the latest version. - - Args: - branches (dict): Branches for the focused package. 
- branch_name (str): Branch name - - Returns: - dict: Branches for the focused package - - """ - if self.latest_timestamp == int(timestamp): - branches[b'HEAD'] = { - 'target_type': 'alias', - 'target': branch_name, + try: + # Prepare origin and origin_visit + origin = {'url': self.url} + self.storage.origin_add_one(origin) + visit_id = self.storage.origin_visit_add( + origin=self.url, + date=self.visit_date, + type=self.visit_type)['visit'] + last_snapshot = self.last_snapshot() + logger.debug('last snapshot: %s', last_snapshot) + known_artifacts = self.known_artifacts(last_snapshot) + logger.debug('known artifacts: %s', known_artifacts) + + # Retrieve the default release version (the "latest" one) + default_version = self.get_default_version() + logger.debug('default version: %s', default_version) + + for version in self.get_versions(): # for each + logger.debug('version: %s', version) + tmp_revisions[version] = [] + # `p_` stands for `package_` + for branch_name, p_info in self.get_package_info(version): + logger.debug('package_info: %s', p_info) + revision_id = self.resolve_revision_from( + known_artifacts, p_info['raw']) + if revision_id is None: + with tempfile.TemporaryDirectory() as tmpdir: + try: + dl_artifacts = self.download_package( + p_info, tmpdir) + except Exception: + logger.exception('Unable to retrieve %s', + p_info) + status_visit = 'partial' + continue + + uncompressed_path = self.uncompress( + dl_artifacts, dest=tmpdir) + logger.debug('uncompressed_path: %s', + uncompressed_path) + + directory = Directory.from_disk( + path=uncompressed_path.encode('utf-8'), + data=True) # noqa + # FIXME: Try not to load the full raw content in + # memory + objects = directory.collect() + + contents = objects['content'].values() + logger.debug('Number of contents: %s', + len(contents)) + + self.storage.content_add( + [content_for_storage(x) for x in contents]) + + status_load = 'eventful' + directories = list(objects['directory'].values()) + + logger.debug('Number of directories: %s', + len(directories)) + + self.storage.directory_add(directories) + + # FIXME: This should be release. cf. 
D409 + revision = self.build_revision( + p_info['raw'], uncompressed_path) + revision.update({ + 'synthetic': True, + 'directory': directory.hash, + }) + + revision['metadata'].update({ + 'original_artifact': [ + hashes for _, hashes in dl_artifacts + ], + }) + + revision['id'] = revision_id = identifier_to_bytes( + revision_identifier(revision)) + + logger.debug('Revision: %s', revision) + + self.storage.revision_add([revision]) + + tmp_revisions[version].append((branch_name, revision_id)) + + logger.debug('tmp_revisions: %s', tmp_revisions) + # Build and load the snapshot + branches = {} + for version, branch_name_revisions in tmp_revisions.items(): + if version == default_version and \ + len(branch_name_revisions) == 1: + # only 1 branch (no ambiguity), we can create an alias + # branch 'HEAD' + branch_name, _ = branch_name_revisions[0] + # except for some corner case (deposit) + if branch_name != 'HEAD': + branches[b'HEAD'] = { + 'target_type': 'alias', + 'target': branch_name.encode('utf-8'), + } + + for branch_name, target in branch_name_revisions: + branch_name = branch_name.encode('utf-8') + branches[branch_name] = { + 'target_type': 'revision', + 'target': target, + } + + snapshot = { + 'branches': branches } - return branches + logger.debug('snapshot: %s', snapshot) - def target_from_version(self, revision_id): - return { - 'target': revision_id, - 'target_type': 'revision', - } if revision_id else None + snapshot['id'] = identifier_to_bytes( + snapshot_identifier(snapshot)) - def load_status(self): - return { - 'status': self._load_status, + logger.debug('snapshot: %s', snapshot) + self.storage.snapshot_add([snapshot]) + if hasattr(self.storage, 'flush'): + self.storage.flush() + except Exception: + logger.exception('Fail to load %s' % self.url) + status_visit = 'partial' + status_load = 'failed' + finally: + self.storage.origin_visit_update( + origin=self.url, visit_id=visit_id, status=status_visit, + snapshot=snapshot and snapshot['id']) + result = { + 'status': status_load, } - - def visit_status(self): - return self._visit_status - - def cleanup(self): - """Clean up temporary disk use after downloading and extracting - package tarballs. 
- - """ - if self.debug: - self.log.warning('%s Will not clean up temp dir %s' % ( - DEBUG_MODE, self.temp_directory - )) - return - if os.path.exists(self.temp_directory): - self.log.debug('Clean up %s' % self.temp_directory) - shutil.rmtree(self.temp_directory) + if snapshot: + result['snapshot_id'] = snapshot['id'] + return result diff --git a/swh/loader/package/npm.py b/swh/loader/package/npm.py new file mode 100644 index 0000000..0bc3ab8 --- /dev/null +++ b/swh/loader/package/npm.py @@ -0,0 +1,295 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import logging +import os +import re + +from codecs import BOM_UTF8 +from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional + +import chardet +import iso8601 + +from swh.model.identifiers import normalize_timestamp +from swh.loader.package.loader import PackageLoader +from swh.loader.package.utils import api_info, release_name + + +logger = logging.getLogger(__name__) + + +_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None} + +# https://github.com/jonschlinkert/author-regex +_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)' + + +class NpmLoader(PackageLoader): + visit_type = 'npm' + + def __init__(self, package_name, package_url, package_metadata_url): + super().__init__(url=package_url) + self.provider_url = package_metadata_url + + self._info = None + self._versions = None + + # if package_url is None: + # package_url = 'https://www.npmjs.com/package/%s' % package_name + # if package_metadata_url is None: + # package_metadata_url = 'https://replicate.npmjs.com/%s/' %\ + # quote(package_name, safe='') + + @property + def info(self) -> Dict: + """Return the project metadata information (fetched from npm registry) + + """ + if not self._info: + self._info = api_info(self.provider_url) + return self._info + + def get_versions(self) -> Sequence[str]: + return sorted(list(self.info['versions'].keys())) + + def get_default_version(self) -> str: + return self.info['dist-tags'].get('latest', '') + + def get_package_info(self, version: str) -> Generator[ + Tuple[str, Mapping[str, Any]], None, None]: + meta = self.info['versions'][version] + url = meta['dist']['tarball'] + p_info = { + 'url': url, + 'filename': os.path.basename(url), + 'raw': meta, + } + yield release_name(version), p_info + + def resolve_revision_from( + self, known_artifacts: Dict, artifact_metadata: Dict) \ + -> Optional[bytes]: + shasum = artifact_metadata['dist']['shasum'] + for rev_id, known_artifact in known_artifacts.items(): + original_artifact = known_artifact['original_artifact'][0] + if shasum == original_artifact['checksums']['sha1']: + return rev_id + + def build_revision( + self, a_metadata: Dict, uncompressed_path: str) -> Dict: + i_metadata = extract_intrinsic_metadata(uncompressed_path) + # from intrinsic metadata + author = extract_npm_package_author(i_metadata) + # extrinsic metadata + version = i_metadata['version'] + date = self.info['time'][version] + date = iso8601.parse_date(date) + date = normalize_timestamp(int(date.timestamp())) + message = version.encode('ascii') + + return { + 'type': 'tar', + 'message': message, + 'author': author, + 'date': date, + 'committer': author, + 'committer_date': date, + 'parents': [], + 'metadata': { + 'intrinsic': { + 'tool': 'package.json', + 'raw': 
i_metadata,
+                },
+                'extrinsic': {
+                    'provider': self.provider_url,
+                    'when': self.visit_date.isoformat(),
+                    'raw': a_metadata,
+                },
+            },
+        }
+
+
+def parse_npm_package_author(author_str):
+    """
+    Parse npm package author string.
+
+    It works with a flexible range of formats, as detailed below::
+
+        name
+        name <email>
+        name <email> (url)
+        name <email>(url)
+        name<email> (url)
+        name<email>(url)
+        name (url)
+        name(url)
+        <email>
+        <email> (url)
+        <email>(url)
+        (url)
+
+    Args:
+        author_str (str): input author string
+
+    Returns:
+        dict: A dict that may contain the following keys:
+            * name
+            * email
+            * url
+
+    """
+    author = {}
+    matches = re.findall(_author_regexp,
+                         author_str.replace('<>', '').replace('()', ''),
+                         re.M)
+    for match in matches:
+        if match[0].strip():
+            author['name'] = match[0].strip()
+        if match[1].strip():
+            author['email'] = match[1].strip()
+        if match[2].strip():
+            author['url'] = match[2].strip()
+    return author
+
+
+def extract_npm_package_author(package_json):
+    """
+    Extract package author from a ``package.json`` file content and
+    return it in swh format.
+
+    Args:
+        package_json (dict): Dict holding the content of parsed
+            ``package.json`` file
+
+    Returns:
+        dict: A dict with the following keys:
+            * fullname
+            * name
+            * email
+
+    """
+
+    def _author_str(author_data):
+        if type(author_data) is dict:
+            author_str = ''
+            if 'name' in author_data:
+                author_str += author_data['name']
+            if 'email' in author_data:
+                author_str += ' <%s>' % author_data['email']
+            return author_str
+        elif type(author_data) is list:
+            return _author_str(author_data[0]) if len(author_data) > 0 else ''
+        else:
+            return author_data
+
+    author_data = {}
+    for author_key in ('author', 'authors'):
+        if author_key in package_json:
+            author_str = _author_str(package_json[author_key])
+            author_data = parse_npm_package_author(author_str)
+
+    name = author_data.get('name')
+    email = author_data.get('email')
+
+    fullname = None
+
+    if name and email:
+        fullname = '%s <%s>' % (name, email)
+    elif name:
+        fullname = name
+
+    if not fullname:
+        return _EMPTY_AUTHOR
+
+    if fullname:
+        fullname = fullname.encode('utf-8')
+
+    if name:
+        name = name.encode('utf-8')
+
+    if email:
+        email = email.encode('utf-8')
+
+    return {'fullname': fullname, 'name': name, 'email': email}
+
+
+def _lstrip_bom(s, bom=BOM_UTF8):
+    if s.startswith(bom):
+        return s[len(bom):]
+    else:
+        return s
+
+
+def load_json(json_bytes):
+    """
+    Try to load JSON from bytes and return a dictionary.
+
+    First try to decode from utf-8. If the decoding failed,
+    try to detect the encoding and decode again with replace
+    error handling.
+
+    If JSON is malformed, an empty dictionary will be returned.
+
+    Args:
+        json_bytes (bytes): binary content of a JSON file
+
+    Returns:
+        dict: JSON data loaded in a dictionary
+    """
+    json_data = {}
+    try:
+        json_str = _lstrip_bom(json_bytes).decode('utf-8')
+    except UnicodeDecodeError:
+        encoding = chardet.detect(json_bytes)['encoding']
+        if encoding:
+            json_str = json_bytes.decode(encoding, 'replace')
+        else:
+            # no encoding could be detected: give up on this content
+            return json_data
+    try:
+        json_data = json.loads(json_str)
+    except json.decoder.JSONDecodeError:
+        pass
+    return json_data
+
+
+def extract_intrinsic_metadata(dir_path: str) -> Dict:
+    """Given an uncompressed path holding the ``package.json`` file, returns
+    a package.json parsed structure as a dict.
+
+    The release artifact contains at their root one folder. For example:
+    $ tar tvf zprint-0.0.6.tar.gz
+    drwxr-xr-x root/root         0 2018-08-22 11:01 zprint-0.0.6/
+    ...
+
+    Args:
+
+        dir_path (str): Path to the uncompressed directory
+            representing a release artifact from npm.
+
+    Returns:
+        the package.json parsed structure as a dict if any or None if
+        none was present.
+
+    """
+    # Retrieve the root folder of the archive
+    if not os.path.exists(dir_path):
+        return {}
+    lst = os.listdir(dir_path)
+    if len(lst) == 0:
+        return {}
+    project_dirname = lst[0]
+    package_json_path = os.path.join(dir_path, project_dirname,
+                                     'package.json')
+    if not os.path.exists(package_json_path):
+        return {}
+    with open(package_json_path, 'rb') as package_json_file:
+        package_json_bytes = package_json_file.read()
+        return load_json(package_json_bytes)
diff --git a/swh/loader/package/pypi.py b/swh/loader/package/pypi.py
new file mode 100644
index 0000000..dec6735
--- /dev/null
+++ b/swh/loader/package/pypi.py
@@ -0,0 +1,193 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+
+from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
+from urllib.parse import urlparse
+from pkginfo import UnpackedSDist
+
+import iso8601
+
+from swh.model.identifiers import normalize_timestamp
+from swh.loader.package.loader import PackageLoader
+from swh.loader.package.utils import api_info, release_name
+
+
+class PyPILoader(PackageLoader):
+    """Load pypi origin's artifact releases into swh archive.
+
+    """
+    visit_type = 'pypi'
+
+    def __init__(self, url):
+        super().__init__(url=url)
+        self._info = None
+        self.provider_url = pypi_api_url(self.url)
+
+    @property
+    def info(self) -> Dict:
+        """Return the project metadata information (fetched from pypi registry)
+
+        """
+        if not self._info:
+            self._info = api_info(self.provider_url)
+        return self._info
+
+    def get_versions(self) -> Sequence[str]:
+        return list(self.info['releases'].keys())
+
+    def get_default_version(self) -> str:
+        return self.info['info']['version']
+
+    def get_package_info(self, version: str) -> Generator[
+            Tuple[str, Mapping[str, Any]], None, None]:
+        res = []
+        for meta in self.info['releases'][version]:
+            filename = meta['filename']
+            p_info = {
+                'url': meta['url'],
+                'filename': filename,
+                'raw': meta,
+            }
+            res.append((version, p_info))
+
+        if len(res) == 1:
+            version, p_info = res[0]
+            yield release_name(version), p_info
+        else:
+            for version, p_info in res:
+                yield release_name(version, p_info['filename']), p_info
+
+    def resolve_revision_from(
+            self, known_artifacts: Dict, artifact_metadata: Dict) \
+            -> Optional[bytes]:
+        sha256 = artifact_metadata['digests']['sha256']
+        for rev_id, known_artifact in known_artifacts.items():
+            for original_artifact in known_artifact['original_artifact']:
+                if sha256 == original_artifact['checksums']['sha256']:
+                    return rev_id
+
+    def build_revision(
+            self, a_metadata: Dict, uncompressed_path: str) -> Dict:
+        i_metadata = extract_intrinsic_metadata(uncompressed_path)
+
+        # from intrinsic metadata
+        name = i_metadata['version']
+        _author = author(i_metadata)
+
+        # from extrinsic metadata
+        message = a_metadata.get('comment_text', '')
+        message = '%s: %s' % (name, message) if message else name
+        date = normalize_timestamp(
+            int(iso8601.parse_date(a_metadata['upload_time']).timestamp()))
+
+        return {
+            'type': 'tar',
+            'message': message.encode('utf-8'),
+            'author': _author,
+            'date': date,
+            'committer': _author,
+            'committer_date': date,
+            'parents': [],
+            'metadata': {
+                'intrinsic': {
+                    'tool': 'PKG-INFO',
+                    'raw': i_metadata,
+                },
+                'extrinsic': {
+                    'provider': self.provider_url,
+                    'when': self.visit_date.isoformat(),
+                    'raw': a_metadata,
+                },
+            }
+        }
+
+
+def pypi_api_url(url: str) -> str:
+    """Compute api url from a project url
+
+    This deals with correctly transforming the project's url into its api
+    counterpart.
+
+    Args:
+        url (str): PyPI instance's url (e.g: https://pypi.org/project/requests)
+
+    Returns:
+        api url (e.g https://pypi.org/pypi/requests/json)
+
+    """
+    p_url = urlparse(url)
+    project_name = p_url.path.split('/')[-1]
+    url = '%s://%s/pypi/%s/json' % (p_url.scheme, p_url.netloc, project_name)
+    return url
+
+
+def extract_intrinsic_metadata(dir_path: str) -> Dict:
+    """Given an uncompressed path holding the pkginfo file, returns a
+    pkginfo parsed structure as a dict.
+
+    The release artifact contains at their root one folder. For example:
+    $ tar tvf zprint-0.0.6.tar.gz
+    drwxr-xr-x root/root         0 2018-08-22 11:01 zprint-0.0.6/
+    ...
+
+    Args:
+
+        dir_path (str): Path to the uncompressed directory
+            representing a release artifact from pypi.
+
+    Returns:
+        the pkginfo parsed structure as a dict if any or None if
+        none was present.
+
+    """
+    # Retrieve the root folder of the archive
+    if not os.path.exists(dir_path):
+        return {}
+    lst = os.listdir(dir_path)
+    if len(lst) != 1:
+        return {}
+    project_dirname = lst[0]
+    pkginfo_path = os.path.join(dir_path, project_dirname, 'PKG-INFO')
+    if not os.path.exists(pkginfo_path):
+        return {}
+    pkginfo = UnpackedSDist(pkginfo_path)
+    raw = pkginfo.__dict__
+    raw.pop('filename')  # this gets added with the ondisk location
+    return raw
+
+
+def author(data: Dict) -> Dict:
+    """Given a dict of project/release artifact information (coming from
+    PyPI), returns an author subset.
+
+    Args:
+        data (dict): Representing either artifact information or
+            release information.
+
+    Returns:
+        swh-model dict representing a person.
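+
+    For instance (illustrative values), an input of
+    {'author': 'Jane Doe', 'author_email': 'jane@example.org'}
+    would yield::
+
+        {'fullname': b'Jane Doe <jane@example.org>',
+         'name': b'Jane Doe',
+         'email': b'jane@example.org'}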
+
+
+def extract_intrinsic_metadata(dir_path: str) -> Dict:
+    """Given an uncompressed path holding the pkginfo file, returns a
+       pkginfo parsed structure as a dict.
+
+       The release artifact contains one folder at its root. For example:
+       $ tar tvf zprint-0.0.6.tar.gz
+       drwxr-xr-x root/root         0 2018-08-22 11:01 zprint-0.0.6/
+       ...
+
+    Args:
+
+        dir_path (str): Path to the uncompressed directory
+                        representing a release artifact from pypi.
+
+    Returns:
+        the pkginfo parsed structure as a dict, or an empty
+        dict if no PKG-INFO was present.
+
+    """
+    # Retrieve the root folder of the archive
+    if not os.path.exists(dir_path):
+        return {}
+    lst = os.listdir(dir_path)
+    if len(lst) != 1:
+        return {}
+    project_dirname = lst[0]
+    pkginfo_path = os.path.join(dir_path, project_dirname, 'PKG-INFO')
+    if not os.path.exists(pkginfo_path):
+        return {}
+    pkginfo = UnpackedSDist(pkginfo_path)
+    raw = pkginfo.__dict__
+    raw.pop('filename')  # this gets added with the on-disk location
+    return raw
+
+
+def author(data: Dict) -> Dict:
+    """Given a dict of project/release artifact information (coming from
+       PyPI), returns an author subset.
+
+    Args:
+        data (dict): Representing either artifact information or
+                     release information.
+
+    Returns:
+        swh-model dict representing a person.
+
+    """
+    name = data.get('author')
+    email = data.get('author_email')
+
+    if email:
+        fullname = '%s <%s>' % (name, email)
+    else:
+        fullname = name
+
+    if not fullname:
+        return {'fullname': b'', 'name': None, 'email': None}
+
+    fullname = fullname.encode('utf-8')
+
+    if name is not None:
+        name = name.encode('utf-8')
+
+    if email is not None:
+        email = email.encode('utf-8')
+
+    return {'fullname': fullname, 'name': name, 'email': email}
diff --git a/swh/loader/package/tasks.py b/swh/loader/package/tasks.py
index e4fb6f6..eeeb5ea 100644
--- a/swh/loader/package/tasks.py
+++ b/swh/loader/package/tasks.py
@@ -1,13 +1,45 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from celery import current_app as app
+from celery import shared_task
 
-from swh.loader.package.loader import GNULoader
+from swh.loader.package.debian import DebianLoader
+from swh.loader.package.deposit import DepositLoader
+from swh.loader.package.npm import NpmLoader
+from swh.loader.package.pypi import PyPILoader
+from swh.loader.package.archive import ArchiveLoader
 
-@app.task(name=__name__ + '.LoadGNU')
-def load_gnu(name, origin_url=None, tarballs=None):
-    return GNULoader().load(name, origin_url,
-                            tarballs=tarballs)
+
+@shared_task(name=__name__ + '.LoadArchive')
+def load_archive(url=None, artifacts=None, identity_artifact_keys=None):
+    return ArchiveLoader(url, artifacts,
+                         identity_artifact_keys=identity_artifact_keys).load()
+
+
+@shared_task(name=__name__ + '.LoadDebian')
+def load_debian(*, url, date, packages):
+    return DebianLoader(url, date, packages).load()
+
+
+@shared_task(name=__name__ + '.LoadDeposit')
+def load_deposit(*, url, deposit_id):
+    return DepositLoader(url, deposit_id).load()
+
+
+@shared_task(name=__name__ + '.LoadGNU')
+def load_gnu(*, url, tarballs):
+    # gnu tarballs go through the generic archive loader, whose default
+    # identity keys match the gnu artifact layout
+    return ArchiveLoader(url, tarballs).load()
+
+
+@shared_task(name=__name__ + '.LoadNpm')
+def load_npm(*, package_name, package_url, package_metadata_url):
+    return NpmLoader(package_name, package_url, package_metadata_url).load()
+
+
+@shared_task(name=__name__ + '.LoadPyPI')
+def load_pypi(*, url=None):
+    return PyPILoader(url).load()
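As a usage sketch for the tasks above: they are registered under fully qualified names ending in LoadArchive, LoadDebian, LoadDeposit, LoadGNU, LoadNpm and LoadPyPI, and take keyword arguments matching the loader constructors. A hypothetical producer-side call (assuming a configured Celery application and a worker that imports swh.loader.package.tasks) could look like:

    from celery import current_app

    # queue a pypi visit; the worker resolves the task by its registered name
    current_app.send_task(
        'swh.loader.package.tasks.LoadPyPI',
        kwargs={'url': 'https://pypi.org/project/requests'})

    # queue an archive visit with an explicit artifact list
    current_app.send_task(
        'swh.loader.package.tasks.LoadArchive',
        kwargs={
            'url': 'https://ftp.gnu.org/gnu/8sync/',
            'artifacts': [{
                'time': 944729610,
                'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
                'length': 221837,
                'version': '0.1.0',
            }],
        })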
diff --git a/swh/loader/package/tests/common.py b/swh/loader/package/tests/common.py
index 2d54f11..36cfc18 100644
--- a/swh/loader/package/tests/common.py
+++ b/swh/loader/package/tests/common.py
@@ -1,32 +1,127 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import os
-import os.path
+from os import path
 
-RESOURCES_PATH = os.path.join(os.path.dirname(__file__), 'resources')
+import logging
 
-package = '8sync'
+from typing import Dict, List, Tuple
 
-package_url = 'https://ftp.gnu.org/gnu/8sync/'
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
 
-tarball = [{'date': '944729610',
-            'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz'}]
+logger = logging.getLogger(__file__)
 
-def init_test_data(mock_tarball_request):
-    """Initialize the loader with the mock of the tarballs
+
+DATADIR = path.join(path.abspath(path.dirname(__file__)), 'data')
+
+
+def decode_target(target):
+    """Test helper to ease readability in tests
 
     """
-    for version in tarball:
-        tarball_url = version['archive']
-        tarball_filename = tarball_url.split('/')[-1]
-        tarball_filepath = os.path.join(RESOURCES_PATH, 'tarballs',
-                                        tarball_filename)
-        with open(tarball_filepath, mode='rb') as tarball_file:
-            tarball_content = tarball_file.read()
-        mock_tarball_request.get(
-            tarball_url, content=tarball_content,
-            headers={'content-length': str(len(tarball_content))})
+    if not target:
+        return target
+    target_type = target['target_type']
+
+    if target_type == 'alias':
+        decoded_target = target['target'].decode('utf-8')
+    else:
+        decoded_target = hash_to_hex(target['target'])
+
+    return {
+        'target': decoded_target,
+        'target_type': target_type
+    }
+
+
+def check_snapshot(expected_snapshot, storage):
+    """Check for snapshot match.
+
+    Provide the hashes as hexadecimal; the conversion is done
+    within the method.
+
+    Args:
+        expected_snapshot (dict): full snapshot with hex ids
+        storage (Storage): storage to lookup the snapshot in
+
+    """
+    expected_snapshot_id = expected_snapshot['id']
+    expected_branches = expected_snapshot['branches']
+    snap = storage.snapshot_get(hash_to_bytes(expected_snapshot_id))
+    if snap is None:
+        # display known snapshots instead if possible
+        if hasattr(storage, '_snapshots'):  # in-mem storage
+            from pprint import pprint
+            for snap_id, (_snap, _) in storage._snapshots.items():
+                snapd = _snap.to_dict()
+                snapd['id'] = hash_to_hex(snapd['id'])
+                branches = {
+                    branch.decode('utf-8'): decode_target(target)
+                    for branch, target in snapd['branches'].items()
+                }
+                snapd['branches'] = branches
+                pprint(snapd)
+        raise AssertionError('Snapshot is not found')
+
+    branches = {
+        branch.decode('utf-8'): decode_target(target)
+        for branch, target in snap['branches'].items()
+    }
+    assert expected_branches == branches
+
+
+def check_metadata(metadata: Dict, key_path: str, raw_type: str):
+    """Given a metadata dict, ensure the associated key_path value is of type
+       raw_type.
+
+    Args:
+        metadata: Dict to check
+        key_path: Path to check
+        raw_type: Type to check the path with
+
+    Raises:
+        Assertion error in case of mismatch
+
+    """
+    data = metadata
+    keys = key_path.split('.')
+    for k in keys:
+        try:
+            data = data[k]
+        except (TypeError, KeyError) as e:
+            # KeyError: because path too long
+            # TypeError: data is not a dict
+            raise AssertionError(e)
+    assert isinstance(data, raw_type)
+
+
+def check_metadata_paths(metadata: Dict, paths: List[Tuple[str, str]]):
+    """Given a metadata dict, ensure the keys are of expected types
+
+    Args:
+        metadata: Dict to check
+        paths: List of (key_path, raw_type) pairs to check
+
+    Raises:
+        Assertion error in case of mismatch
+
+    """
+    for key_path, raw_type in paths:
+        check_metadata(metadata, key_path, raw_type)
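For instance, the nested revision metadata layout produced by the loaders above can be asserted in one call (the values below are illustrative):

    # illustrative use of the helpers defined above
    metadata = {
        'intrinsic': {'tool': 'PKG-INFO', 'raw': {'name': '0805nexter'}},
        'extrinsic': {'provider': 'https://pypi.org/pypi/0805nexter/json'},
    }
    check_metadata_paths(metadata, paths=[
        ('intrinsic.tool', str),
        ('intrinsic.raw', dict),
        ('extrinsic.provider', str),
    ])

A wrong type or a missing key anywhere along a path raises AssertionError, which keeps the per-loader tests short.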
+
+
+def get_stats(storage) -> Dict:
+    """Adaptation utility to unify the stats counters across storage
+       implementations.
+
+    """
+    storage.refresh_stat_counters()
+    stats = storage.stat_counters()
+
+    keys = ['content', 'directory', 'origin', 'origin_visit', 'person',
+            'release', 'revision', 'skipped_content', 'snapshot']
+    return {k: stats.get(k) for k in keys}
diff --git a/swh/loader/package/tests/conftest.py b/swh/loader/package/tests/conftest.py
new file mode 100644
index 0000000..f80142f
--- /dev/null
+++ b/swh/loader/package/tests/conftest.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import pytest
+import yaml
+
+from swh.storage.tests.conftest import *  # noqa
+from swh.scheduler.tests.conftest import *  # noqa
+
+
+@pytest.fixture
+def swh_config(monkeypatch, swh_storage_postgresql, tmp_path):
+    storage_config = {
+        'storage': {
+            'cls': 'local',
+            'args': {
+                'db': swh_storage_postgresql.dsn,
+                'objstorage': {
+                    'cls': 'memory',
+                    'args': {}
+                },
+            },
+        },
+        'url': 'https://deposit.softwareheritage.org/1/private',
+    }
+
+    conffile = os.path.join(str(tmp_path), 'loader.yml')
+    with open(conffile, 'w') as f:
+        f.write(yaml.dump(storage_config))
+    monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile)
+    return conffile
+
+
+@pytest.fixture(autouse=True, scope='session')
+def swh_proxy():
+    """Automatically inject this fixture in all tests to ensure no outside
+    connection takes place.
+
+    """
+    os.environ['http_proxy'] = 'http://localhost:999'
+    os.environ['https_proxy'] = 'http://localhost:999'
+
+
+@pytest.fixture(scope='session')  # type: ignore  # expected redefinition
+def celery_includes():
+    return [
+        'swh.loader.package.tasks',
+    ]
diff --git a/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz
new file mode 100644
index 0000000..834ac91
Binary files /dev/null and b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz differ
diff --git a/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.dsc b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.dsc
new file mode 100644
index 0000000..1f94b20
--- /dev/null
+++ b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.dsc
@@ -0,0 +1,44 @@
+-----BEGIN PGP SIGNED MESSAGE-----
+Hash: SHA512
+
+Format: 1.0
+Source: cicero
+Binary: cicero
+Architecture: all
+Version: 0.7.2-3
+Maintainer: Debian Accessibility Team
+Uploaders: Samuel Thibault
+Homepage: http://pages.infinit.net/fluxnic/cicero/
+Standards-Version: 3.9.6
+Vcs-Browser: http://git.debian.org/?p=pkg-a11y/cicero.git;a=summary
+Vcs-Git: git://git.debian.org/git/pkg-a11y/cicero.git
+Build-Depends: debhelper (>= 7)
+Package-List:
+ cicero deb contrib/sound optional arch=all
+Checksums-Sha1:
+ a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43 96527 cicero_0.7.2.orig.tar.gz
+ 0815282053f21601b0ec4adf7a8fe47eace3c0bc 3964 cicero_0.7.2-3.diff.gz
+Checksums-Sha256:
+ 63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786 96527 cicero_0.7.2.orig.tar.gz
+ f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c 3964 cicero_0.7.2-3.diff.gz
+Files:
+ 4353dede07c5728319ba7f5595a7230a 96527 cicero_0.7.2.orig.tar.gz
+ 
a93661b6a48db48d59ba7d26796fc9ce 3964 cicero_0.7.2-3.diff.gz + +-----BEGIN PGP SIGNATURE----- +Version: GnuPG v1 + +iQIcBAEBCgAGBQJUQ9GjAAoJEBH0lP5vGG7NTFMQAIk5Wkicp5/GQOfkFh5qT7X7 +cKd98i/7t/0HznGCPv1iaQwsky5wbdqimMaW+vnKWEj8P2AEOLmyfGAjAKGSj0yW +r28dB0+vaiy1rFbtuTL+AVrtO2b/uVuh9eA2ZhDgLekv//bSzpMorIF+uqdQS18d +x2y9ZyKOucVPc+ARTcTrOmPbKR7ywIZEaj3E0Lq5p1e50BkqHVbZzzM7dMZuyatH +FcTsoCjz9kiulGx4LGzItajMBOdA2lIK4TlBRsO6wApOIvOnhSEQr5CqwbVwzwGv +N//8EoiNbs5bpweOGXOLN/RzvRPaEp/8W5P+E7jKyyiGkBeBrQeDlJA5cqBXcz1G +63zVmLyp3AYDrRaQ1AvgUyoL91mQIsDwc2gwT3YRYc4TE4HtYCAD85e/NGCAG5mk +vy+WH6NaaU6mb17IN7V+mGgb/l5pgwPADP4VaFugjrZK7nJp6I2xK2FmgDlGw8gj +qC2LUVuI/ijxTkxS9KdGSwtF4YLw6hbhUIv+19n5ajJ8MpTniv3hPiG4ZYY0qc7j +oejiRGszAR9syTjPKHhYpBnKwTVg8dkaOI+Jw+uwlK5W0opKoDt4Kr4ceCxuxsvU +L1I0MtaTGsGABJTX6utGvklYROApAcqMzGYozNeYOuGlWpvBx5QqdTmo6yv515cq +vWwMF6ldOni8Da5B/7Q9 +=XtIw +-----END PGP SIGNATURE----- diff --git a/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.diff.gz b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.diff.gz new file mode 100644 index 0000000..71726d2 Binary files /dev/null and b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.diff.gz differ diff --git a/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.dsc b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.dsc new file mode 100644 index 0000000..e5cd3ff --- /dev/null +++ b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.dsc @@ -0,0 +1,43 @@ +-----BEGIN PGP SIGNED MESSAGE----- +Hash: SHA512 + +Format: 1.0 +Source: cicero +Binary: cicero +Architecture: all +Version: 0.7.2-4 +Maintainer: Debian Accessibility Team +Uploaders: Samuel Thibault +Homepage: http://pages.infinit.net/fluxnic/cicero/ +Standards-Version: 4.1.4 +Vcs-Browser: https://anonscm.debian.org/git/pkg-a11y/cicero.git +Vcs-Git: https://anonscm.debian.org/git/pkg-a11y/cicero.git +Build-Depends: debhelper (>= 7) +Package-List: + cicero deb contrib/sound optional arch=all +Checksums-Sha1: + a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43 96527 cicero_0.7.2.orig.tar.gz + d21527f61e4ea81398337e4f20314bd6e72b48e3 4038 cicero_0.7.2-4.diff.gz +Checksums-Sha256: + 63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786 96527 cicero_0.7.2.orig.tar.gz + 2e6fa296ee7005473ff58d0971f4fd325617b445671480e9f2cfb738d5dbcd01 4038 cicero_0.7.2-4.diff.gz +Files: + 4353dede07c5728319ba7f5595a7230a 96527 cicero_0.7.2.orig.tar.gz + 1e7e6fc4a59d57c98082a3af78145734 4038 cicero_0.7.2-4.diff.gz + +-----BEGIN PGP SIGNATURE----- + +iQIzBAEBCgAdFiEEmjc9NmSo3GLaCjT9nlEeAcc38HUFAlrklRgACgkQnlEeAcc3 +8HXJBQ//XE8KG5H+XuJIYHIPv0MRKx3b8A5PUeyW3k2NRo7a70bRuGhe/xVtKr+B +OUSluKAYVcpATHLrJi0PdrC2RQ8E4ck25g8alW+3Dvi5YbMPjeg9dSdBk6kuxWO0 +64L1o4rfuyY5LE+fGVZ0nmSHak9apIJ9SP3Bgg0OodiFaqIurxXwOGI60jcp47Oy +sS+joZUziLY81SIkvx8GkMKzw1PW/k4Jo4L7S34iYMuMG/FcsUeHx1/8DdDoYKui +DihTifecPvBlHDs/7kFqdyASXSF+ilKx0SIUWsdkzzgIhIzO8fZ/vXEwMBaDfxum +uMQXg2KWW0TY/zRqPsgsfXRrCa1nwbxXJNf/YluNYWQ6uWd3KMEjJ71vHSkb3FKG +x1naDozDEeM0Sed1mT9eKqr/tfFl1NG6liJy3b8F8HXJiJrRp1ToGBqtL5VXz53z +3ssKcb/UxIhNujT4o7WGgiP+jWj/CIcYdsJ9keM4mA2FBzeaObz1scWi539JrdRA +oqnj7xVfWGAZAg0Ozce+7rKPQqACMB9vMzHJ0NpZZvIYUHSz9AwGvVoQKYc0CE/r +qFjVRcASNKHAAg+l4wv9n+zOdFUTeOa3hTxCHQAhSXnCWfw4zfsJQK0Ntbrcg94V +8IfnmEtJXEJwQWmEn17zEerEDEbc5+KqLApqwnYol5rulR1VNi4= +=VcXs +-----END PGP SIGNATURE----- diff 
--git a/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz new file mode 100644 index 0000000..aa0a389 Binary files /dev/null and b/swh/loader/package/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz differ diff --git a/swh/loader/package/tests/data/http_deb.debian.org/onefile.txt b/swh/loader/package/tests/data/http_deb.debian.org/onefile.txt new file mode 100644 index 0000000..1d62cd2 --- /dev/null +++ b/swh/loader/package/tests/data/http_deb.debian.org/onefile.txt @@ -0,0 +1 @@ +This is a file to retrieve information from in a test context diff --git a/swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_meta b/swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_meta new file mode 120000 index 0000000..1c75198 --- /dev/null +++ b/swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_meta @@ -0,0 +1 @@ +hello_2.10.json \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_raw b/swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_raw new file mode 120000 index 0000000..51b4a7a --- /dev/null +++ b/swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_raw @@ -0,0 +1 @@ +hello_2.10.orig.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.json b/swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.json new file mode 100644 index 0000000..cab9c3b --- /dev/null +++ b/swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.json @@ -0,0 +1,80 @@ +{ + "origin": { + "url": "https://hal-test.archives-ouvertes.fr/some-external-id", + "type": "deposit" + }, + "origin_metadata": { + "metadata": { + "@xmlns": [ + "http://www.w3.org/2005/Atom" + ], + "author": [ + "some awesome author", + "another one", + "no one" + ], + "codemeta:dateCreated": "2017-10-07T15:17:08Z", + "external_identifier": "some-external-id", + "url": "https://hal-test.archives-ouvertes.fr/some-external-id" + }, + "provider": { + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + "metadata": null + }, + "tool": { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": { + "sword_version": "2" + } + } + }, + "revision": { + "synthetic": true, + "committer_date": { + "timestamp": { + "seconds": 1507389428, + "microseconds": 0 + }, + "offset": 0, + "negative_utc": false + }, + "message": "hal: Deposit 666 in collection hal", + "author": { + "name": "Software Heritage", + "fullname": "Software Heritage", + "email": "robot@softwareheritage.org" + }, + "committer": { + "name": "Software Heritage", + "fullname": "Software Heritage", + "email": "robot@softwareheritage.org" + }, + "date": { + "timestamp": { + "seconds": 1507389428, + "microseconds": 0 + }, + "offset": 0, + "negative_utc": false + }, + "metadata": { + "@xmlns": [ + "http://www.w3.org/2005/Atom" + ], + "author": [ + "some awesome author", + "another one", + "no one" + ], + "external_identifier": "some-external-id", + "codemeta:dateCreated": "2017-10-07T15:17:08Z", + "url": "https://hal-test.archives-ouvertes.fr/some-external-id" + }, + "type": "tar", + "parents": [] + }, + "branch_name": "master" +} 
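The naming of the fixture files above and below follows a simple url-to-path flattening: the scheme and host form the directory ('https_deposit.softwareheritage.org'), and the path segments are joined with underscores ('1_private_666_meta' for /1/private/666/meta). A sketch of that mapping, for orientation only (this is not the mock fixture's actual implementation):

    import os
    from urllib.parse import urlparse

    def url_to_datafile(datadir: str, url: str) -> str:
        # scheme_host directory, path flattened with underscores
        p = urlparse(url)
        dirname = '%s_%s' % (p.scheme, p.netloc)
        filename = p.path.lstrip('/').replace('/', '_')
        return os.path.join(datadir, dirname, filename)

    assert url_to_datafile(
        'data', 'https://pypi.org/pypi/0805nexter/json'
    ) == os.path.join('data', 'https_pypi.org', 'pypi_0805nexter_json')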
diff --git a/swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.orig.tar.gz b/swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.orig.tar.gz new file mode 100644 index 0000000..cae6b33 Binary files /dev/null and b/swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.orig.tar.gz differ diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz new file mode 100644 index 0000000..5f5fcb1 Binary files /dev/null and b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz differ diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip new file mode 100644 index 0000000..4b8f135 Binary files /dev/null and b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip differ diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip new file mode 100644 index 0000000..8638d33 Binary files /dev/null and b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip differ diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip new file mode 100644 index 0000000..3fa6c3a Binary files /dev/null and b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip differ diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip new file mode 100644 index 0000000..316ced2 Binary files /dev/null and b/swh/loader/package/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip differ diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz b/swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz new file mode 120000 index 0000000..5e1851b --- /dev/null +++ b/swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz @@ -0,0 +1 @@ +0805nexter-1.1.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip b/swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip new file mode 120000 index 0000000..e4b08b9 --- /dev/null +++ b/swh/loader/package/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip @@ -0,0 +1 @@ +0805nexter-1.1.0.zip \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip new file mode 120000 index 0000000..beeb0f1 --- /dev/null +++ b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip @@ -0,0 +1 @@ +0805nexter-1.3.0.zip \ No newline at end of file diff --git 
a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip new file mode 120000 index 0000000..58026f3 --- /dev/null +++ b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip @@ -0,0 +1 @@ +0805nexter-1.2.0.zip \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 new file mode 120000 index 0000000..58026f3 --- /dev/null +++ b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 @@ -0,0 +1 @@ +0805nexter-1.2.0.zip \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip new file mode 120000 index 0000000..e4b08b9 --- /dev/null +++ b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip @@ -0,0 +1 @@ +0805nexter-1.1.0.zip \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 new file mode 120000 index 0000000..e4b08b9 --- /dev/null +++ b/swh/loader/package/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 @@ -0,0 +1 @@ +0805nexter-1.1.0.zip \ No newline at end of file diff --git a/swh/loader/package/tests/resources/tarballs/8sync-0.1.0.tar.gz b/swh/loader/package/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz similarity index 100% rename from swh/loader/package/tests/resources/tarballs/8sync-0.1.0.tar.gz rename to swh/loader/package/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz diff --git a/swh/loader/package/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz b/swh/loader/package/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz new file mode 100644 index 0000000..ad9cbfa Binary files /dev/null and b/swh/loader/package/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz differ diff --git a/swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json b/swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json new file mode 100644 index 0000000..357bf16 --- /dev/null +++ b/swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json @@ -0,0 +1,95 @@ +{ + "info": { + "author": "hgtkpython", + "author_email": "2868989685@qq.com", + "bugtrack_url": null, + "classifiers": 
[], + "description": "UNKNOWN", + "description_content_type": null, + "docs_url": null, + "download_url": "UNKNOWN", + "downloads": { + "last_day": -1, + "last_month": -1, + "last_week": -1 + }, + "home_page": "http://www.hp.com", + "keywords": null, + "license": "UNKNOWN", + "maintainer": null, + "maintainer_email": null, + "name": "0805nexter", + "package_url": "https://pypi.org/project/0805nexter/", + "platform": "UNKNOWN", + "project_url": "https://pypi.org/project/0805nexter/", + "project_urls": { + "Download": "UNKNOWN", + "Homepage": "http://www.hp.com" + }, + "release_url": "https://pypi.org/project/0805nexter/1.2.0/", + "requires_dist": null, + "requires_python": null, + "summary": "a simple printer of nested lest", + "version": "1.2.0" + }, + "last_serial": 1931736, + "releases": { + "1.1.0": [ + { + "comment_text": "", + "digests": { + "md5": "07fc93fc12821c1405c3483db88154af", + "sha256": "52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035" + }, + "downloads": -1, + "filename": "0805nexter-1.1.0.zip", + "has_sig": false, + "md5_digest": "07fc93fc12821c1405c3483db88154af", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 862, + "upload_time": "2016-01-31T05:28:42", + "url": "https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip" + } + ], + "1.2.0": [ + { + "comment_text": "", + "digests": { + "md5": "89123c78bd5d3f61cb8f46029492b18a", + "sha256": "49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709" + }, + "downloads": -1, + "filename": "0805nexter-1.2.0.zip", + "has_sig": false, + "md5_digest": "89123c78bd5d3f61cb8f46029492b18a", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 898, + "upload_time": "2016-01-31T05:51:25", + "url": "https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip" + } + ] + }, + "urls": [ + { + "comment_text": "", + "digests": { + "md5": "89123c78bd5d3f61cb8f46029492b18a", + "sha256": "49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709" + }, + "downloads": -1, + "filename": "0805nexter-1.2.0.zip", + "has_sig": false, + "md5_digest": "89123c78bd5d3f61cb8f46029492b18a", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 898, + "upload_time": "2016-01-31T05:51:25", + "url": "https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip" + } + ] +} diff --git a/swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 b/swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 new file mode 100644 index 0000000..fbeb488 --- /dev/null +++ b/swh/loader/package/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 @@ -0,0 +1,114 @@ +{ + "info": { + "author": "hgtkpython", + "author_email": "2868989685@qq.com", + "bugtrack_url": null, + "classifiers": [], + "description": "UNKNOWN", + "description_content_type": null, + "docs_url": null, + "download_url": "UNKNOWN", + "downloads": { + "last_day": -1, + "last_month": -1, + "last_week": -1 + }, + "home_page": "http://www.hp.com", + "keywords": null, + "license": "UNKNOWN", + "maintainer": null, + "maintainer_email": null, + "name": "0805nexter", + "package_url": "https://pypi.org/project/0805nexter/", + "platform": "UNKNOWN", + "project_url": "https://pypi.org/project/0805nexter/", + "project_urls": { + 
"Download": "UNKNOWN", + "Homepage": "http://www.hp.com" + }, + "release_url": "https://pypi.org/project/0805nexter/1.3.0/", + "requires_dist": null, + "requires_python": null, + "summary": "a simple printer of nested lest", + "version": "1.3.0" + }, + "last_serial": 1931736, + "releases": { + "1.1.0": [ + { + "comment_text": "", + "digests": { + "md5": "07fc93fc12821c1405c3483db88154af", + "sha256": "52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035" + }, + "downloads": -1, + "filename": "0805nexter-1.1.0.zip", + "has_sig": false, + "md5_digest": "07fc93fc12821c1405c3483db88154af", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 862, + "upload_time": "2016-01-31T05:28:42", + "url": "https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip" + } + ], + "1.2.0": [ + { + "comment_text": "", + "digests": { + "md5": "89123c78bd5d3f61cb8f46029492b18a", + "sha256": "49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709" + }, + "downloads": -1, + "filename": "0805nexter-1.2.0.zip", + "has_sig": false, + "md5_digest": "89123c78bd5d3f61cb8f46029492b18a", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 898, + "upload_time": "2016-01-31T05:51:25", + "url": "https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip" + } + ], + "1.3.0": [ + { + "comment_text": "Made up release 1.3.0 for swh-loader-pypi purposes", + "digests": { + "md5": "54d9750a1ab7ab82cd8c460c2c6c0ecc", + "sha256": "7097c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1" + }, + "downloads": -1, + "filename": "0805nexter-1.3.0.zip", + "has_sig": false, + "md5_digest": "54d9750a1ab7ab82cd8c460c2c6c0ecc", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 1370, + "upload_time": "2018-09-17T16:18:01", + "url": "https://files.pythonhosted.org/packages/70/97/c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1/0805nexter-1.3.0.zip" + } + ] + }, + "urls": [ + { + "comment_text": "", + "digests": { + "md5": "89123c78bd5d3f61cb8f46029492b18a", + "sha256": "49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709" + }, + "downloads": -1, + "filename": "0805nexter-1.2.0.zip", + "has_sig": false, + "md5_digest": "89123c78bd5d3f61cb8f46029492b18a", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 898, + "upload_time": "2016-01-31T05:51:25", + "url": "https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip" + } + ] +} diff --git a/swh/loader/package/tests/data/https_pypi.org/pypi_nexter_json b/swh/loader/package/tests/data/https_pypi.org/pypi_nexter_json new file mode 100644 index 0000000..8e0f1cd --- /dev/null +++ b/swh/loader/package/tests/data/https_pypi.org/pypi_nexter_json @@ -0,0 +1,74 @@ +{ + "info": { + "author": "hgtkpython", + "author_email": "2868989685@qq.com", + "bugtrack_url": null, + "classifiers": [], + "description": "UNKNOWN", + "description_content_type": null, + "docs_url": null, + "download_url": "UNKNOWN", + "downloads": { + "last_day": -1, + "last_month": -1, + "last_week": -1 + }, + "home_page": "http://www.hp.com", + "keywords": null, + "license": "UNKNOWN", + "maintainer": null, + "maintainer_email": null, + "name": "nexter", + "package_url": "https://pypi.org/project/nexter/", + 
"platform": "UNKNOWN", + "project_url": "https://pypi.org/project/nexter/", + "project_urls": { + "Download": "UNKNOWN", + "Homepage": "http://www.hp.com" + }, + "release_url": "https://pypi.org/project/nexter/1.1.0/", + "requires_dist": null, + "requires_python": null, + "summary": "a simple printer of nested lest", + "version": "1.1.0" + }, + "last_serial": 1931736, + "releases": { + "1.1.0": [ + { + "comment_text": "", + "digests": { + "md5": "07fc93fc12821c1405c3483db88154af", + "sha256": "52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035" + }, + "downloads": -1, + "filename": "nexter-1.1.0.zip", + "has_sig": false, + "md5_digest": "07fc93fc12821c1405c3483db88154af", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 862, + "upload_time": "2016-01-31T05:28:42", + "url": "https://files.pythonhosted.org/nexter-1.1.0.zip" + }, + { + "comment_text": "", + "digests": { + "md5": "6a855ad907c849c4655db102f86b159f", + "sha256": "9e574473374413c85224f532443a3063bc974dd2f43d33afccee39c81006e906" + }, + "downloads": -1, + "filename": "nexter-1.1.0.tar.gz", + "has_sig": false, + "md5_digest": "6a855ad907c849c4655db102f86b159f", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 584, + "upload_time": "2016-01-31T05:51:25", + "url": "https://files.pythonhosted.org/nexter-1.1.0.tar.gz" + } + ] + } +} diff --git a/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.2.tgz b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.2.tgz new file mode 100644 index 0000000..b726261 Binary files /dev/null and b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.2.tgz differ diff --git a/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.3.tgz b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.3.tgz new file mode 100644 index 0000000..bc20daa Binary files /dev/null and b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.3.tgz differ diff --git a/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.4.tgz b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.4.tgz new file mode 100644 index 0000000..a431eeb Binary files /dev/null and b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.4.tgz differ diff --git a/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.5.tgz b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.5.tgz new file mode 100644 index 0000000..8381a52 Binary files /dev/null and b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.0.5.tgz differ diff --git a/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.1.0.tgz b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.1.0.tgz new file mode 100644 index 0000000..738c28a Binary files /dev/null and b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.1.0.tgz differ diff --git a/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.2.0.tgz b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.2.0.tgz new file mode 100644 index 0000000..544bf08 Binary files /dev/null and b/swh/loader/package/tests/data/https_registry.npmjs.org/org_-_org-0.2.0.tgz differ diff --git a/swh/loader/package/tests/data/https_replicate.npmjs.com/org b/swh/loader/package/tests/data/https_replicate.npmjs.com/org new file mode 100644 index 0000000..3aba6b1 --- /dev/null +++ 
b/swh/loader/package/tests/data/https_replicate.npmjs.com/org @@ -0,0 +1,191 @@ +{ + "_id": "org", + "_rev": "4-22484cc537f12d3023241211ee34e39d", + "name": "org", + "description": "A parser and converter for org-mode notation", + "dist-tags": { + "latest": "0.0.4" + }, + "versions": { + "0.0.2": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "main": "./lib/org.js", + "version": "0.0.2", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "bugs": { + "url": "https://github.com/mooz/org-js/issues" + }, + "_id": "org@0.0.2", + "dist": { + "shasum": "12c58092e7de94456a43ef7823eef45e4d1d12fe", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.2.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.3": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.3", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.3", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.4": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.4", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.4", + "dist": { + "shasum": "788b3be1a50f7c94c1500ae4d922ec76c04e06ea", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.4.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + } + }, + "readme": "org-js\n======\n\nParser and converter for org-mode () notation written in JavaScript.\n\nInteractive Editor\n------------------\n\nFor working example, see http://mooz.github.com/org-js/editor/.\n\nInstallation\n------------\n\n npm install org\n\nSimple example of org -> HTML conversion\n----------------------------------------\n\n```javascript\nvar org = require(\"org\");\n\nvar parser = new org.Parser();\nvar orgDocument = 
parser.parse(orgCode);\nvar orgHTMLDocument = orgDocument.convert(org.ConverterHTML, {\n headerOffset: 1,\n exportFromLineNumber: false,\n suppressSubScriptHandling: false,\n suppressAutoLink: false\n});\n\nconsole.dir(orgHTMLDocument); // => { title, contentHTML, tocHTML, toc }\nconsole.log(orgHTMLDocument.toString()) // => Rendered HTML\n```\n\nWriting yet another converter\n-----------------------------\n\nSee `lib/org/converter/html.js`.\n", + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "time": { + "modified": "2019-01-05T01:37:44.220Z", + "created": "2014-01-01T15:40:31.231Z", + "0.0.2": "2014-01-01T15:40:33.020Z", + "0.0.3": "2014-01-01T15:55:45.497Z", + "0.0.4": "2014-01-02T06:10:26.485Z" + }, + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "users": { + "nak2k": true, + "bgschaid": true, + "422665vijay": true, + "nontau": true + }, + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "readmeFilename": "README.md" +} diff --git a/swh/loader/package/tests/data/https_replicate.npmjs.com/org_visit1 b/swh/loader/package/tests/data/https_replicate.npmjs.com/org_visit1 new file mode 100644 index 0000000..d0e56b7 --- /dev/null +++ b/swh/loader/package/tests/data/https_replicate.npmjs.com/org_visit1 @@ -0,0 +1,347 @@ +{ + "_id": "org", + "_rev": "4-22484cc537f12d3023241211ee34e39d", + "name": "org", + "description": "A parser and converter for org-mode notation", + "dist-tags": { + "latest": "0.2.0" + }, + "versions": { + "0.0.2": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "main": "./lib/org.js", + "version": "0.0.2", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "bugs": { + "url": "https://github.com/mooz/org-js/issues" + }, + "_id": "org@0.0.2", + "dist": { + "shasum": "12c58092e7de94456a43ef7823eef45e4d1d12fe", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.2.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.3": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.3", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.3", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + 
"email": "stillpedant@gmail.com" + } + ] + }, + "0.0.4": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.4", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.4", + "dist": { + "shasum": "788b3be1a50f7c94c1500ae4d922ec76c04e06ea", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.4.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.5": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.5", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.5", + "dist": { + "shasum": "66e8c316cb37e1c176f604aa53fcb07b6f51b908", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.5.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.1.0": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.1.0", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "gitHead": "82ed089de208e82a3bf3463f52fa77006382674c", + "_id": "org@0.1.0", + "scripts": {}, + "_shasum": "bfaab735973c1a88fd62a21faf527ce360a412e9", + "_from": ".", + "_npmVersion": "1.4.28", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "dist": { + "shasum": "bfaab735973c1a88fd62a21faf527ce360a412e9", + "tarball": "https://registry.npmjs.org/org/-/org-0.1.0.tgz" + } + }, + "0.2.0": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.2.0", + 
"directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "gitHead": "57b4480747e6e0c26baa43f267218a34b59224a5", + "_id": "org@0.2.0", + "scripts": {}, + "_shasum": "d76378387dc506fb8e3ccff73a0ad04e3afc6391", + "_from": ".", + "_npmVersion": "1.4.28", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "dist": { + "shasum": "d76378387dc506fb8e3ccff73a0ad04e3afc6391", + "tarball": "https://registry.npmjs.org/org/-/org-0.2.0.tgz" + } + } + }, + "readme": "org-js\n======\n\nParser and converter for org-mode () notation written in JavaScript.\n\nInteractive Editor\n------------------\n\nFor working example, see http://mooz.github.com/org-js/editor/.\n\nInstallation\n------------\n\n npm install org\n\nSimple example of org -> HTML conversion\n----------------------------------------\n\n```javascript\nvar org = require(\"org\");\n\nvar parser = new org.Parser();\nvar orgDocument = parser.parse(orgCode);\nvar orgHTMLDocument = orgDocument.convert(org.ConverterHTML, {\n headerOffset: 1,\n exportFromLineNumber: false,\n suppressSubScriptHandling: false,\n suppressAutoLink: false\n});\n\nconsole.dir(orgHTMLDocument); // => { title, contentHTML, tocHTML, toc }\nconsole.log(orgHTMLDocument.toString()) // => Rendered HTML\n```\n\nWriting yet another converter\n-----------------------------\n\nSee `lib/org/converter/html.js`.\n", + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "time": { + "modified": "2019-01-05T01:37:44.220Z", + "created": "2014-01-01T15:40:31.231Z", + "0.0.2": "2014-01-01T15:40:33.020Z", + "0.0.3": "2014-01-01T15:55:45.497Z", + "0.0.4": "2014-01-02T06:10:26.485Z", + "0.0.5": "2014-01-03T13:58:20.540Z", + "0.1.0": "2014-11-23T03:47:12.464Z", + "0.2.0": "2015-02-21T07:14:47.785Z" + }, + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "users": { + "nak2k": true, + "bgschaid": true, + "422665vijay": true, + "nontau": true + }, + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "readmeFilename": "README.md" +} diff --git a/swh/loader/package/tests/test_archive.py b/swh/loader/package/tests/test_archive.py new file mode 100644 index 0000000..d128a44 --- /dev/null +++ b/swh/loader/package/tests/test_archive.py @@ -0,0 +1,344 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.model.hashutil import hash_to_bytes + +from swh.loader.package.archive import ArchiveLoader, artifact_identity +from swh.loader.package.tests.common import ( + check_snapshot, check_metadata_paths, get_stats +) + + +URL = 'https://ftp.gnu.org/gnu/8sync/' +GNU_ARTIFACTS = [ + { + 'time': 944729610, + 'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz', + 'length': 221837, + 'filename': '8sync-0.1.0.tar.gz', + 'version': '0.1.0', + } +] + +_expected_new_contents_first_visit = [ + 'e9258d81faf5881a2f96a77ba609396f82cb97ad', + '1170cf105b04b7e2822a0e09d2acf71da7b9a130', + 'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac', + 
+    '0057bec9b5422aff9256af240b177ac0e3ac2608',
+    '2b8d0d0b43a1078fc708930c8ddc2956a86c566e',
+    '27de3b3bc6545d2a797aeeb4657c0e215a0c2e55',
+    '2e6db43f5cd764e677f416ff0d0c78c7a82ef19b',
+    'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62',
+    'edeb33282b2bffa0e608e9d2fd960fd08093c0ea',
+    'd64e64d4c73679323f8d4cde2643331ba6c20af9',
+    '7a756602914be889c0a2d3952c710144b3e64cb0',
+    '84fb589b554fcb7f32b806951dcf19518d67b08f',
+    '8624bcdae55baeef00cd11d5dfcfa60f68710a02',
+    'e08441aeab02704cfbd435d6445f7c072f8f524e',
+    'f67935bc3a83a67259cda4b2d43373bd56703844',
+    '809788434b433eb2e3cfabd5d591c9a659d5e3d8',
+    '7d7c6c8c5ebaeff879f61f37083a3854184f6c41',
+    'b99fec102eb24bffd53ab61fc30d59e810f116a2',
+    '7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68',
+    'f0c97052e567948adf03e641301e9983c478ccff',
+    '7fb724242e2b62b85ca64190c31dcae5303e19b3',
+    '4f9709e64a9134fe8aefb36fd827b84d8b617ab5',
+    '7350628ccf194c2c3afba4ac588c33e3f3ac778d',
+    '0bb892d9391aa706dc2c3b1906567df43cbe06a2',
+    '49d4c0ce1a16601f1e265d446b6c5ea6b512f27c',
+    '6b5cc594ac466351450f7f64a0b79fdaf4435ad3',
+    '3046e5d1f70297e2a507b98224b6222c9688d610',
+    '1572607d456d7f633bc6065a2b3048496d679a31',
+]
+
+_expected_new_directories_first_visit = [
+    'daabc65ec75d487b1335ffc101c0ac11c803f8fc',
+    '263be23b4a8101d3ad0d9831319a3e0f2b065f36',
+    '7f6e63ba6eb3e2236f65892cd822041f1a01dd5c',
+    '4db0a3ecbc976083e2dac01a62f93729698429a3',
+    'dfef1c80e1098dd5deda664bb44a9ab1f738af13',
+    'eca971d346ea54d95a6e19d5051f900237fafdaa',
+    '3aebc29ed1fccc4a6f2f2010fb8e57882406b528',
+]
+
+_expected_new_revisions_first_visit = {
+    '44183488c0774ce3c957fa19ba695cf18a4a42b3':
+    '3aebc29ed1fccc4a6f2f2010fb8e57882406b528'
+}
+
+_expected_branches_first_visit = {
+    'HEAD': {
+        'target_type': 'alias',
+        'target': 'releases/0.1.0',
+    },
+    'releases/0.1.0': {
+        'target_type': 'revision',
+        'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3',
+    },
+}
+
+# hash is different than before as we changed the snapshot
+# gnu used to use `release/` (singular) instead of plural
+_expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5'  # noqa
+
+
+def test_visit_with_no_artifact_found(swh_config, requests_mock_datadir):
+    url = URL
+    unknown_artifact_url = 'https://ftp.g.o/unknown/8sync-0.1.0.tar.gz'
+    loader = ArchiveLoader(url, artifacts=[
+        {
+            'time': 944729610,
+            'url': unknown_artifact_url,  # unknown artifact
+            'length': 221837,
+            'filename': '8sync-0.1.0.tar.gz',
+            'version': '0.1.0',
+        }
+    ])
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'uneventful'
+    stats = get_stats(loader.storage)
+
+    assert {
+        'content': 0,
+        'directory': 0,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 0,
+        'release': 0,
+        'revision': 0,
+        'skipped_content': 0,
+        'snapshot': 1,
+    } == stats
+
+    origin_visit = next(loader.storage.origin_visit_get(url))
+    assert origin_visit['status'] == 'partial'
+
+
+def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
+    loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+
+    expected_revision_id = hash_to_bytes(
+        '44183488c0774ce3c957fa19ba695cf18a4a42b3')
+    revision = list(loader.storage.revision_get([expected_revision_id]))[0]
+
+    assert revision is not None
+
+    check_metadata_paths(revision['metadata'], paths=[
+        ('intrinsic', dict),
+        ('extrinsic.provider', str),
+        ('extrinsic.when', str),
+        ('extrinsic.raw', dict),
+        ('original_artifact', list),
+    ])
+
+    for original_artifact in 
revision['metadata']['original_artifact']:
+        check_metadata_paths(original_artifact, paths=[
+            ('filename', str),
+            ('length', int),
+            ('checksums', dict),
+        ])
+
+
+def test_visit_with_release_artifact_no_prior_visit(
+        swh_config, requests_mock_datadir):
+    """With no prior visit, loading a gnu project ends up with 1 snapshot
+
+    """
+    loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+
+    stats = get_stats(loader.storage)
+    assert {
+        'content': len(_expected_new_contents_first_visit),
+        'directory': len(_expected_new_directories_first_visit),
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': len(_expected_new_revisions_first_visit),
+        'skipped_content': 0,
+        'snapshot': 1
+    } == stats
+
+    expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
+    assert list(loader.storage.content_missing_per_sha1(expected_contents)) \
+        == []
+
+    expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
+    assert list(loader.storage.directory_missing(expected_dirs)) == []
+
+    expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
+    assert list(loader.storage.revision_missing(expected_revs)) == []
+
+    expected_snapshot = {
+        'id': _expected_new_snapshot_first_visit_id,
+        'branches': _expected_branches_first_visit,
+    }
+
+    check_snapshot(expected_snapshot, loader.storage)
+
+
+def test_2_visits_without_change(swh_config, requests_mock_datadir):
+    """With no change between two visits, the second visit is uneventful
+       and the origin is fetched only once
+
+    """
+    url = URL
+    loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS)
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit['status'] == 'full'
+
+    actual_load_status2 = loader.load()
+    assert actual_load_status2['status'] == 'uneventful'
+    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit2['status'] == 'full'
+
+    urls = [
+        m.url for m in requests_mock_datadir.request_history
+        if m.url.startswith('https://ftp.gnu.org')
+    ]
+    assert len(urls) == 1
+
+
+def test_2_visits_with_new_artifact(swh_config, requests_mock_datadir):
+    """A second visit with an additional artifact ends up with a new
+       snapshot holding the extra release
+
+    """
+    url = URL
+    artifact1 = GNU_ARTIFACTS[0]
+    loader = ArchiveLoader(url, [artifact1])
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit['status'] == 'full'
+
+    stats = get_stats(loader.storage)
+    assert {
+        'content': len(_expected_new_contents_first_visit),
+        'directory': len(_expected_new_directories_first_visit),
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': len(_expected_new_revisions_first_visit),
+        'skipped_content': 0,
+        'snapshot': 1
+    } == stats
+
+    urls = [
+        m.url for m in requests_mock_datadir.request_history
+        if m.url.startswith('https://ftp.gnu.org')
+    ]
+    assert len(urls) == 1
+
+    artifact2 = {
+        'time': 1480991830,
+        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+        'length': 238466,
+        'filename': '8sync-0.2.0.tar.gz',
+        'version': '0.2.0',
+    }
+
+    loader2 = ArchiveLoader(url, [artifact1, artifact2])
+    # implementation detail: share the storage in between visits
+    loader2.storage = loader.storage
+    stats2 = get_stats(loader2.storage)
+    assert stats == stats2  # ensure we share the 
storage
+
+    actual_load_status2 = loader2.load()
+    assert actual_load_status2['status'] == 'eventful'
+
+    stats2 = get_stats(loader.storage)
+    assert {
+        'content': len(_expected_new_contents_first_visit) + 14,
+        'directory': len(_expected_new_directories_first_visit) + 8,
+        'origin': 1,
+        'origin_visit': 1 + 1,
+        'person': 1,
+        'release': 0,
+        'revision': len(_expected_new_revisions_first_visit) + 1,
+        'skipped_content': 0,
+        'snapshot': 1 + 1,
+    } == stats2
+
+    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit2['status'] == 'full'
+
+    urls = [
+        m.url for m in requests_mock_datadir.request_history
+        if m.url.startswith('https://ftp.gnu.org')
+    ]
+    # 1 artifact (2nd time no modification) + 1 new artifact
+    assert len(urls) == 2
+
+
+def test_artifact_identity():
+    """Computing the primary key should return the right identity
+
+    """
+    data = {
+        'a': 1,
+        'b': 2,
+        'length': 221837,
+        'filename': '8sync-0.1.0.tar.gz',
+        'version': '0.1.0',
+    }
+
+    for id_keys, expected_id in [
+            (['a', 'b'], [1, 2]),
+            ([], []),
+            (['a', 'key-that-does-not-exist'], [1, None])
+    ]:
+        actual_id = artifact_identity(data, id_keys=id_keys)
+        assert actual_id == expected_id
+
+
+def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir):
+    """Loading a project archive (not gnu) ends up with 1 snapshot
+
+    """
+    url = 'https://something.else.org/8sync/'
+    artifacts = [  # this is not a gnu artifact
+        {
+            'time': '1999-12-09T09:53:30+00:00',  # it's also not a timestamp
+            'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4',  # noqa
+            # keep a gnu artifact reference to avoid adding other test files
+            'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+            'length': 238466,
+            'filename': '8sync-0.2.0.tar.gz',
+            'version': '0.2.0',
+        }
+    ]
+
+    # Here the loader defines the id_keys to use for existence in the snapshot
+    # (it's not the default gnu identity, hence the explicit keys below)
+    loader = ArchiveLoader(
+        url, artifacts=artifacts, identity_artifact_keys=[
+            'sha256', 'length', 'url'])
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit['status'] == 'full'
+
+    actual_load_status2 = loader.load()
+    assert actual_load_status2['status'] == 'uneventful'
+    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit2['status'] == 'full'
+
+    urls = [
+        m.url for m in requests_mock_datadir.request_history
+        if m.url.startswith('https://ftp.gnu.org')
+    ]
+    assert len(urls) == 1
diff --git a/swh/loader/package/tests/test_common.py b/swh/loader/package/tests/test_common.py
new file mode 100644
index 0000000..95a5a9a
--- /dev/null
+++ b/swh/loader/package/tests/test_common.py
@@ -0,0 +1,175 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.model.hashutil import hash_to_bytes
+from swh.loader.package.tests.common import (
+    decode_target, check_snapshot, check_metadata, check_metadata_paths
+)
+from swh.storage import get_storage
+
+hash_hex = '43e45d56f88993aae6a0198013efa80716fd8920'
+
+
+def test_decode_target_edge():
+    assert not decode_target(None)
+
+
+def test_decode_target():
+    actual_alias_decode_target = decode_target({
+        'target_type': 'alias',
+        'target': b'something',
+    })
+
+    assert 
actual_alias_decode_target == { + 'target_type': 'alias', + 'target': 'something', + } + + actual_decode_target = decode_target({ + 'target_type': 'revision', + 'target': hash_to_bytes(hash_hex), + }) + + assert actual_decode_target == { + 'target_type': 'revision', + 'target': hash_hex, + } + + +def test_check_snapshot(): + storage = get_storage(cls='memory', args={}) + + snap_id = '2498dbf535f882bc7f9a18fb16c9ad27fda7bab7' + snapshot = { + 'id': hash_to_bytes(snap_id), + 'branches': { + b'master': { + 'target': hash_to_bytes(hash_hex), + 'target_type': 'revision', + }, + }, + } + + s = storage.snapshot_add([snapshot]) + assert s == { + 'snapshot:add': 1, + } + + expected_snapshot = { + 'id': snap_id, + 'branches': { + 'master': { + 'target': hash_hex, + 'target_type': 'revision', + } + } + } + check_snapshot(expected_snapshot, storage) + + +def test_check_snapshot_failure(): + storage = get_storage(cls='memory', args={}) + + snapshot = { + 'id': hash_to_bytes('2498dbf535f882bc7f9a18fb16c9ad27fda7bab7'), + 'branches': { + b'master': { + 'target': hash_to_bytes(hash_hex), + 'target_type': 'revision', + }, + }, + } + + s = storage.snapshot_add([snapshot]) + assert s == { + 'snapshot:add': 1, + } + + unexpected_snapshot = { + 'id': '2498dbf535f882bc7f9a18fb16c9ad27fda7bab7', + 'branches': { + 'master': { + 'target': hash_hex, + 'target_type': 'release', # wrong value + } + } + } + + with pytest.raises(AssertionError): + check_snapshot(unexpected_snapshot, storage) + + +def test_check_metadata(): + metadata = { + 'a': { + 'raw': { + 'time': 'something', + }, + }, + 'b': [], + 'c': 1, + } + + for raw_path, raw_type in [ + ('a.raw', dict), + ('a.raw.time', str), + ('b', list), + ('c', int), + ]: + check_metadata(metadata, raw_path, raw_type) + + +def test_check_metadata_ko(): + metadata = { + 'a': { + 'raw': 'hello', + }, + 'b': [], + 'c': 1, + } + + for raw_path, raw_type in [ + ('a.b', dict), + ('a.raw.time', str), + ]: + with pytest.raises(AssertionError): + check_metadata(metadata, raw_path, raw_type) + + +def test_check_metadata_paths(): + metadata = { + 'a': { + 'raw': { + 'time': 'something', + }, + }, + 'b': [], + 'c': 1, + } + + check_metadata_paths(metadata, [ + ('a.raw', dict), + ('a.raw.time', str), + ('b', list), + ('c', int), + ]) + + +def test_check_metadata_paths_ko(): + metadata = { + 'a': { + 'raw': 'hello', + }, + 'b': [], + 'c': 1, + } + + with pytest.raises(AssertionError): + check_metadata_paths(metadata, [ + ('a.b', dict), + ('a.raw.time', str), + ]) diff --git a/swh/loader/package/tests/test_conftest.py b/swh/loader/package/tests/test_conftest.py new file mode 100644 index 0000000..d9d16e0 --- /dev/null +++ b/swh/loader/package/tests/test_conftest.py @@ -0,0 +1,12 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest +import requests + + +def test_swh_proxy(): + with pytest.raises(requests.exceptions.ProxyError): + requests.get('https://www.softwareheritage.org') diff --git a/swh/loader/package/tests/test_debian.py b/swh/loader/package/tests/test_debian.py new file mode 100644 index 0000000..2b632be --- /dev/null +++ b/swh/loader/package/tests/test_debian.py @@ -0,0 +1,371 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any 
later version
+# See top-level LICENSE file for more information
+
+import copy
+import logging
+import pytest
+
+from os import path
+
+from swh.loader.package.debian import (
+    DebianLoader, download_package, dsc_information, uid_to_person,
+    prepare_person, get_package_metadata, extract_package
+)
+from swh.loader.package.tests.common import check_snapshot, get_stats
+
+
+logger = logging.getLogger(__name__)
+
+
+PACKAGE_FILES = {
+    'name': 'cicero',
+    'version': '0.7.2-3',
+    'files': {
+        'cicero_0.7.2-3.diff.gz': {
+            'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce',
+            'name': 'cicero_0.7.2-3.diff.gz',
+            'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c',  # noqa
+            'size': 3964,
+            'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-3.diff.gz'  # noqa
+        },
+        'cicero_0.7.2-3.dsc': {
+            'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a',
+            'name': 'cicero_0.7.2-3.dsc',
+            'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03',  # noqa
+            'size': 1864,
+            'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-3.dsc'},  # noqa
+        'cicero_0.7.2.orig.tar.gz': {
+            'md5sum': '4353dede07c5728319ba7f5595a7230a',
+            'name': 'cicero_0.7.2.orig.tar.gz',
+            'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786',  # noqa
+            'size': 96527,
+            'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz'  # noqa
+        }
+    },
+}

+PACKAGE_FILES2 = {
+    'name': 'cicero',
+    'version': '0.7.2-4',
+    'files': {
+        'cicero_0.7.2-4.diff.gz': {
+            'md5sum': '1e7e6fc4a59d57c98082a3af78145734',
+            'name': 'cicero_0.7.2-4.diff.gz',
+            'sha256': '2e6fa296ee7005473ff58d0971f4fd325617b445671480e9f2cfb738d5dbcd01',  # noqa
+            'size': 4038,
+            'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-4.diff.gz'  # noqa
+        },
+        'cicero_0.7.2-4.dsc': {
+            'md5sum': '1a6c8855a73b4282bb31d15518f18cde',
+            'name': 'cicero_0.7.2-4.dsc',
+            'sha256': '913ee52f7093913420de5cbe95d63cfa817f1a1daf997961149501894e754f8b',  # noqa
+            'size': 1881,
+            'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-4.dsc'},  # noqa
+        'cicero_0.7.2.orig.tar.gz': {
+            'md5sum': '4353dede07c5728319ba7f5595a7230a',
+            'name': 'cicero_0.7.2.orig.tar.gz',
+            'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786',  # noqa
+            'size': 96527,
+            'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz'  # noqa
+        }
+    }
+}
+
+
+PACKAGE_PER_VERSION = {
+    'stretch/contrib/0.7.2-3': PACKAGE_FILES,
+}
+
+
+PACKAGES_PER_VERSION = {
+    'stretch/contrib/0.7.2-3': PACKAGE_FILES,
+    'buster/contrib/0.7.2-4': PACKAGE_FILES2,
+}
+
+
+def test_debian_first_visit(
+        swh_config, requests_mock_datadir):
+    """With no prior visit, loading a debian project ends up with 1 snapshot
+
+    """
+    loader = DebianLoader(
+        url='deb://Debian/packages/cicero',
+        date='2019-10-12T05:58:09.165557+00:00',
+        packages=PACKAGE_PER_VERSION)
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+
+    stats = get_stats(loader.storage)
+    assert {
+        'content': 42,
+        'directory': 2,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': 1,  # all artifacts under 1 revision
+        'skipped_content': 0,
+        'snapshot': 1
+    } == stats
+
+    expected_snapshot = {
+        'id': '3b6b66e6ee4e7d903a379a882684a2a50480c0b4',
+        'branches': {
+            'releases/stretch/contrib/0.7.2-3': {
+                'target_type': 'revision',
+                'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
+            }
+        },
+    }  # different than the previous loader 
as no release is done
+
+    check_snapshot(expected_snapshot, loader.storage)
+
+
+def test_debian_first_visit_then_another_visit(
+        swh_config, requests_mock_datadir):
+    """A first visit followed by a visit without change ends up with 1
+    snapshot and an uneventful second visit
+
+    """
+    url = 'deb://Debian/packages/cicero'
+    loader = DebianLoader(
+        url=url,
+        date='2019-10-12T05:58:09.165557+00:00',
+        packages=PACKAGE_PER_VERSION)
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+    origin_visit = next(loader.storage.origin_visit_get(url))
+    assert origin_visit['status'] == 'full'
+
+    stats = get_stats(loader.storage)
+    assert {
+        'content': 42,
+        'directory': 2,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': 1,  # all artifacts under 1 revision
+        'skipped_content': 0,
+        'snapshot': 1
+    } == stats
+
+    expected_snapshot = {
+        'id': '3b6b66e6ee4e7d903a379a882684a2a50480c0b4',
+        'branches': {
+            'releases/stretch/contrib/0.7.2-3': {
+                'target_type': 'revision',
+                'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
+            }
+        },
+    }  # different than the previous loader as no release is done
+
+    check_snapshot(expected_snapshot, loader.storage)
+
+    # No change in between load
+    actual_load_status2 = loader.load()
+    assert actual_load_status2['status'] == 'uneventful'
+    origin_visit2 = list(loader.storage.origin_visit_get(url))
+    assert origin_visit2[-1]['status'] == 'full'
+
+    stats2 = get_stats(loader.storage)
+    assert {
+        'content': 42 + 0,
+        'directory': 2 + 0,
+        'origin': 1,
+        'origin_visit': 1 + 1,  # a new visit occurred
+        'person': 1,
+        'release': 0,
+        'revision': 1,
+        'skipped_content': 0,
+        'snapshot': 1,  # same snapshot across 2 visits
+    } == stats2
+
+    urls = [
+        m.url for m in requests_mock_datadir.request_history
+        if m.url.startswith('http://deb.debian.org')
+    ]
+    # fetched each package artifact only once across the 2 visits
+    assert len(urls) == len(set(urls))
+
+
+def test_uid_to_person():
+    uid = 'Someone Name <someone@orga.org>'
+    actual_person = uid_to_person(uid)
+
+    assert actual_person == {
+        'name': 'Someone Name',
+        'email': 'someone@orga.org',
+        'fullname': uid,
+    }
+
+
+def test_prepare_person():
+    actual_author = prepare_person({
+        'name': 'Someone Name',
+        'email': 'someone@orga.org',
+        'fullname': 'Someone Name <someone@orga.org>',
+    })
+
+    assert actual_author == {
+        'name': b'Someone Name',
+        'email': b'someone@orga.org',
+        'fullname': b'Someone Name <someone@orga.org>',
+    }
+
+
+def test_download_package(datadir, tmpdir, requests_mock_datadir):
+    tmpdir = str(tmpdir)  # py3.5 work around (LocalPath issue)
+    all_hashes = download_package(PACKAGE_FILES, tmpdir)
+    assert all_hashes == {
+        'cicero_0.7.2-3.diff.gz': {
+            'checksums': {
+                'blake2s256': '08b1c438e70d2474bab843d826515147fa4a817f8c4baaf3ddfbeb5132183f21',  # noqa
+                'sha1': '0815282053f21601b0ec4adf7a8fe47eace3c0bc',
+                'sha1_git': '834ac91da3a9da8f23f47004bb456dd5bd16fe49',
+                'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c'  # noqa
+            },
+            'filename': 'cicero_0.7.2-3.diff.gz',
+            'length': 3964},
+        'cicero_0.7.2-3.dsc': {
+            'checksums': {
+                'blake2s256': '8c002bead3e35818eaa9d00826f3d141345707c58fb073beaa8abecf4bde45d2',  # noqa
+                'sha1': 'abbec4e8efbbc80278236e1dd136831eac08accd',
+                'sha1_git': '1f94b2086fa1142c2df6b94092f5c5fa11093a8e',
+                'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03'  # noqa
+            },
+            'filename': 'cicero_0.7.2-3.dsc',
+            'length': 1864},
+        'cicero_0.7.2.orig.tar.gz': {
+            'checksums': {
+                'blake2s256': '9809aa8d2e2dad7f34cef72883db42b0456ab7c8f1418a636eebd30ab71a15a6',  # noqa 
+ 'sha1': 'a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43', + 'sha1_git': 'aa0a38978dce86d531b5b0299b4a616b95c64c74', + 'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786' # noqa + }, + 'filename': 'cicero_0.7.2.orig.tar.gz', + 'length': 96527 + } + } + + +def test_dsc_information_ok(): + fname = 'cicero_0.7.2-3.dsc' + dsc_url, dsc_name = dsc_information(PACKAGE_FILES) + + assert dsc_url == PACKAGE_FILES['files'][fname]['uri'] + assert dsc_name == PACKAGE_FILES['files'][fname]['name'] + + +def test_dsc_information_not_found(): + fname = 'cicero_0.7.2-3.dsc' + package_files = copy.deepcopy(PACKAGE_FILES) + package_files['files'].pop(fname) + + dsc_url, dsc_name = dsc_information(package_files) + + assert dsc_url is None + assert dsc_name is None + + +def test_dsc_information_too_many_dsc_entries(): + # craft an extra dsc file + fname = 'cicero_0.7.2-3.dsc' + package_files = copy.deepcopy(PACKAGE_FILES) + data = package_files['files'][fname] + fname2 = fname.replace('cicero', 'ciceroo') + package_files['files'][fname2] = data + + with pytest.raises( + ValueError, match='Package %s_%s references several dsc' % ( + package_files['name'], package_files['version'])): + dsc_information(package_files) + + +def test_get_package_metadata(requests_mock_datadir, datadir, tmp_path): + tmp_path = str(tmp_path) # py3.5 compat. + package = PACKAGE_FILES + + logger.debug('package: %s', package) + + # download the packages + all_hashes = download_package(package, tmp_path) + + # Retrieve information from package + _, dsc_name = dsc_information(package) + + dl_artifacts = [(tmp_path, hashes) for hashes in all_hashes.values()] + + # Extract information from package + extracted_path = extract_package(dl_artifacts, tmp_path) + + # Retrieve information on package + dsc_path = path.join(path.dirname(extracted_path), dsc_name) + actual_package_info = get_package_metadata( + package, dsc_path, extracted_path) + + logger.debug('actual_package_info: %s', actual_package_info) + + assert actual_package_info == { + 'changelog': { + 'date': '2014-10-19T16:52:35+02:00', + 'history': [ + ('cicero', '0.7.2-2'), + ('cicero', '0.7.2-1'), + ('cicero', '0.7-1') + ], + 'person': { + 'email': 'sthibault@debian.org', + 'fullname': 'Samuel Thibault ', + 'name': 'Samuel Thibault' + } + }, + 'maintainers': [ + { + 'email': 'debian-accessibility@lists.debian.org', + 'fullname': 'Debian Accessibility Team ' + '', + 'name': 'Debian Accessibility Team' + }, + { + 'email': 'sthibault@debian.org', + 'fullname': 'Samuel Thibault ', + 'name': 'Samuel Thibault' + } + ], + 'name': 'cicero', + 'version': '0.7.2-3' + } + + +def test_debian_multiple_packages(swh_config, requests_mock_datadir): + url = 'deb://Debian/packages/cicero' + loader = DebianLoader( + url=url, + date='2019-10-12T05:58:09.165557+00:00', + packages=PACKAGES_PER_VERSION) + + actual_load_status = loader.load() + assert actual_load_status['status'] == 'eventful' + + origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'full' + + expected_snapshot = { + 'id': 'defc19021187f3727293121fcf6c5c82cb923604', + 'branches': { + 'releases/stretch/contrib/0.7.2-3': { + 'target_type': 'revision', + 'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07', + }, + 'releases/buster/contrib/0.7.2-4': { + 'target_type': 'revision', + 'target': '8224139c274c984147ef4b09aa0e462c55a10bd3', + } + }, + } + + check_snapshot(expected_snapshot, loader.storage) diff --git a/swh/loader/package/tests/test_deposit.py 
b/swh/loader/package/tests/test_deposit.py new file mode 100644 index 0000000..8e580e9 --- /dev/null +++ b/swh/loader/package/tests/test_deposit.py @@ -0,0 +1,204 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import re + + +from swh.model.hashutil import hash_to_bytes +from swh.loader.package.deposit import DepositLoader + +from swh.loader.package.tests.common import ( + check_snapshot, check_metadata_paths, get_stats +) + +from swh.core.pytest_plugin import requests_mock_datadir_factory + + +def test_deposit_init_ok(swh_config): + url = 'some-url' + deposit_id = 999 + loader = DepositLoader(url, deposit_id) # Something that does not exist + + assert loader.url == url + assert loader.archive_url == '/%s/raw/' % deposit_id + assert loader.metadata_url == '/%s/meta/' % deposit_id + assert loader.deposit_update_url == '/%s/update/' % deposit_id + assert loader.client is not None + + +def test_deposit_loading_failure_to_fetch_metadata(swh_config): + """Error during fetching artifact ends us with failed/partial visit + + """ + # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' + url = 'some-url' + unknown_deposit_id = 666 + loader = DepositLoader(url, unknown_deposit_id) # does not exist + + actual_load_status = loader.load() + assert actual_load_status['status'] == 'failed' + + stats = get_stats(loader.storage) + + assert { + 'content': 0, + 'directory': 0, + 'origin': 1, + 'origin_visit': 1, + 'person': 0, + 'release': 0, + 'revision': 0, + 'skipped_content': 0, + 'snapshot': 0, + } == stats + + origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'partial' + + +requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[ + 'https://deposit.softwareheritage.org/1/private/666/raw/', +]) + + +def test_deposit_loading_failure_to_retrieve_1_artifact( + swh_config, requests_mock_datadir_missing_one): + """Deposit with missing artifact ends up with an uneventful/partial visit + + """ + # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' + url = 'some-url-2' + deposit_id = 666 + loader = DepositLoader(url, deposit_id) + + assert loader.archive_url + actual_load_status = loader.load() + assert actual_load_status['status'] == 'uneventful' + + stats = get_stats(loader.storage) + assert { + 'content': 0, + 'directory': 0, + 'origin': 1, + 'origin_visit': 1, + 'person': 0, + 'release': 0, + 'revision': 0, + 'skipped_content': 0, + 'snapshot': 1, + } == stats + + origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'partial' + + +def test_revision_metadata_structure(swh_config, requests_mock_datadir): + # do not care for deposit update query + requests_mock_datadir.put(re.compile('https')) + + url = 'https://hal-test.archives-ouvertes.fr/some-external-id' + deposit_id = 666 + loader = DepositLoader(url, deposit_id) + + assert loader.archive_url + actual_load_status = loader.load() + assert actual_load_status['status'] == 'eventful' + + expected_revision_id = hash_to_bytes( + '9471c606239bccb1f269564c9ea114e1eeab9eb4') + revision = list(loader.storage.revision_get([expected_revision_id]))[0] + + assert revision is not None + + check_metadata_paths(revision['metadata'], paths=[ + ('extrinsic.provider', str), + ('extrinsic.when', str), + ('extrinsic.raw', dict), + 
('original_artifact', list), + ]) + + for original_artifact in revision['metadata']['original_artifact']: + check_metadata_paths(original_artifact, paths=[ + ('filename', str), + ('length', int), + ('checksums', dict), + ]) + + +def test_deposit_loading_ok(swh_config, requests_mock_datadir): + requests_mock_datadir.put(re.compile('https')) # do not care for put + + url = 'https://hal-test.archives-ouvertes.fr/some-external-id' + deposit_id = 666 + loader = DepositLoader(url, deposit_id) + + assert loader.archive_url + actual_load_status = loader.load() + assert actual_load_status['status'] == 'eventful' + + stats = get_stats(loader.storage) + assert { + 'content': 303, + 'directory': 12, + 'origin': 1, + 'origin_visit': 1, + 'person': 1, + 'release': 0, + 'revision': 1, + 'skipped_content': 0, + 'snapshot': 1, + } == stats + + origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'full' + + expected_branches = { + 'HEAD': { + 'target': '9471c606239bccb1f269564c9ea114e1eeab9eb4', + 'target_type': 'revision', + }, + } + + expected_snapshot = { + 'id': '453f455d0efb69586143cd6b6e5897f9906b53a7', + 'branches': expected_branches, + } + check_snapshot(expected_snapshot, storage=loader.storage) + + # check metadata + + tool = { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": { + "sword_version": "2", + } + } + + tool = loader.storage.tool_get(tool) + assert tool is not None + assert tool['id'] is not None + + provider = { + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + "metadata": None, + } + + provider = loader.storage.metadata_provider_get_by(provider) + assert provider is not None + assert provider['id'] is not None + + metadata = list(loader.storage.origin_metadata_get_by( + url, provider_type='deposit_client')) + assert metadata is not None + assert isinstance(metadata, list) + assert len(metadata) == 1 + metadata0 = metadata[0] + + assert metadata0['provider_id'] == provider['id'] + assert metadata0['provider_type'] == 'deposit_client' + assert metadata0['tool_id'] == tool['id'] diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py deleted file mode 100644 index 0159038..0000000 --- a/swh/loader/package/tests/test_loader.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (C) 2019 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest -import os -import requests_mock - -from typing import List - -from swh.loader.package.loader import GNULoader -from swh.loader.core.tests import BaseLoaderStorageTest -from swh.loader.package.tests.common import ( - package, package_url, - tarball, init_test_data -) - -_LOADER_TESTS_CONFIG = { - 'content_packet_size': 10000, - 'content_packet_size_bytes': 104857600, - 'content_size_limit': 104857600, - 'debug': False, - 'directory_packet_size': 25000, - 'occurrence_packet_size': 100000, - 'release_packet_size': 100000, - 'revision_packet_size': 100000, - 'send_contents': True, - 'send_directories': True, - 'send_releases': True, - 'send_revisions': True, - 'send_snapshot': True, - 'storage': {'args': {}, 'cls': 'memory'}, - 'temp_directory': '/tmp/swh.loader.gnu/' -} - - -class GNULoaderTest(GNULoader): - def parse_config_file(self, *args, **kwargs): - return _LOADER_TESTS_CONFIG - - 
-@requests_mock.Mocker() -class TestGNULoader(unittest.TestCase, BaseLoaderStorageTest): - - _expected_new_contents_first_visit = [ - 'e9258d81faf5881a2f96a77ba609396f82cb97ad', - '1170cf105b04b7e2822a0e09d2acf71da7b9a130', - 'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac', - '0057bec9b5422aff9256af240b177ac0e3ac2608', - '2b8d0d0b43a1078fc708930c8ddc2956a86c566e', - '27de3b3bc6545d2a797aeeb4657c0e215a0c2e55', - '2e6db43f5cd764e677f416ff0d0c78c7a82ef19b', - 'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62', - 'edeb33282b2bffa0e608e9d2fd960fd08093c0ea', - 'd64e64d4c73679323f8d4cde2643331ba6c20af9', - '7a756602914be889c0a2d3952c710144b3e64cb0', - '84fb589b554fcb7f32b806951dcf19518d67b08f', - '8624bcdae55baeef00cd11d5dfcfa60f68710a02', - 'e08441aeab02704cfbd435d6445f7c072f8f524e', - 'f67935bc3a83a67259cda4b2d43373bd56703844', - '809788434b433eb2e3cfabd5d591c9a659d5e3d8', - '7d7c6c8c5ebaeff879f61f37083a3854184f6c41', - 'b99fec102eb24bffd53ab61fc30d59e810f116a2', - '7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68', - 'f0c97052e567948adf03e641301e9983c478ccff', - '7fb724242e2b62b85ca64190c31dcae5303e19b3', - '4f9709e64a9134fe8aefb36fd827b84d8b617ab5', - '7350628ccf194c2c3afba4ac588c33e3f3ac778d', - '0bb892d9391aa706dc2c3b1906567df43cbe06a2', - '49d4c0ce1a16601f1e265d446b6c5ea6b512f27c', - '6b5cc594ac466351450f7f64a0b79fdaf4435ad3', - '3046e5d1f70297e2a507b98224b6222c9688d610', - '1572607d456d7f633bc6065a2b3048496d679a31', - ] - - _expected_new_directories_first_visit = [ - 'daabc65ec75d487b1335ffc101c0ac11c803f8fc', - '263be23b4a8101d3ad0d9831319a3e0f2b065f36', - '7f6e63ba6eb3e2236f65892cd822041f1a01dd5c', - '4db0a3ecbc976083e2dac01a62f93729698429a3', - 'dfef1c80e1098dd5deda664bb44a9ab1f738af13', - 'eca971d346ea54d95a6e19d5051f900237fafdaa', - '3aebc29ed1fccc4a6f2f2010fb8e57882406b528', - ] - - _expected_new_revisions_first_visit = { - '44183488c0774ce3c957fa19ba695cf18a4a42b3': - '3aebc29ed1fccc4a6f2f2010fb8e57882406b528' - } - - _expected_branches_first_visit = { - 'HEAD': { - 'target': 'release/8sync-0.1.0', - 'target_type': 'alias' - }, - 'release/8sync-0.1.0': { - 'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3', - 'target_type': 'revision' - }, - } - _expected_new_snapshot_first_visit = '2ae491bbaeef7351641997d1b9193aa2a67d26bc' # noqa - - _expected_new_contents_invalid_origin = [] # type: List[str] - _expected_new_directories_invalid_origin = [] # type: List[str] - - @classmethod - def setUpClass(cls): - cls.reset_loader() - - @classmethod - def reset_loader(cls): - cls.loader = GNULoaderTest() - cls.storage = cls.loader.storage - - def reset_loader_counters(self): - counters_reset = dict.fromkeys(self.loader.counters.keys(), 0) - self.loader.counters.update(counters_reset) - - def test_gnu_loader_first_visit_success(self, mock_tarball_request): - """In this scenario no visit as taken place prior to this visit. 
- - """ - self.reset_loader() - init_test_data(mock_tarball_request) - self.loader.load(package, package_url, tarballs=tarball) - - self.assertCountContents(len(self._expected_new_contents_first_visit)) - self.assertContentsContain(self._expected_new_contents_first_visit) - self.assertEqual(self.loader.counters['contents'], - len(self._expected_new_contents_first_visit)) - - self.assertCountDirectories(len(self._expected_new_directories_first_visit)) # noqa - self.assertDirectoriesContain(self._expected_new_directories_first_visit) # noqa - self.assertEqual(self.loader.counters['directories'], - len(self._expected_new_directories_first_visit)) - - self.assertCountRevisions(1, '1 artifact releases so 1 revisions should be created') # noqa - self.assertRevisionsContain(self._expected_new_revisions_first_visit) - self.assertEqual(self.loader.counters['revisions'], - len(self._expected_new_revisions_first_visit)) - - self.assertCountReleases(0, 'No release is created by the loader') - self.assertEqual(self.loader.counters['releases'], 0) - - self.assertCountSnapshots(1, 'Only 1 snapshot targeting all revisions') - self.assertSnapshotEqual(self._expected_new_snapshot_first_visit, - self._expected_branches_first_visit) - - self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) - self.assertEqual(self.loader.visit_status(), 'full') - - self.assertFalse(os.path.exists(self.loader.temp_directory)) - - def test_gnu_loader_origin_invalid(self, mock_tarball_request): - """In this scenario, tarball link is not valid and will give 404 error - - """ - self.reset_loader() - mock_tarball_request.get( - 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz', - text='Not Found', status_code=404) - self.loader.load(package, package_url, tarballs=tarball) - - self.assertContentsContain(self._expected_new_contents_invalid_origin) - self.assertCountContents(len(self._expected_new_contents_invalid_origin)) # noqa - self.assertEqual(self.loader.counters['contents'], - len(self._expected_new_contents_invalid_origin)) - - self.assertDirectoriesContain(self._expected_new_directories_invalid_origin) # noqa - self.assertCountDirectories(len(self._expected_new_directories_invalid_origin)) # noqa - self.assertEqual(self.loader.counters['directories'], - len(self._expected_new_directories_invalid_origin)) - - self.assertCountRevisions(0, '0 releases so 0 revisions should be created') # noqa - - self.assertEqual(self.loader.counters['releases'], 0) - self.assertCountReleases(0, 'No release is created by the loader') - - self.assertCountSnapshots(1, 'Only 1 snapshot targeting all revisions') - - self.assertEqual(self.loader.load_status(), {'status': 'uneventful'}) - self.assertEqual(self.loader.visit_status(), 'partial') - - self.assertFalse(os.path.exists(self.loader.temp_directory)) - - def test_gnu_loader_second_visit(self, mock_tarball_request): - """This scenario makes use of the incremental nature of the loader. - - In this test there is no change from the first visit. So same result - as first visit. 
- """ - self.reset_loader() - init_test_data(mock_tarball_request) - self.loader.load(package, package_url, tarballs=tarball) - - self.assertCountContents(len(self._expected_new_contents_first_visit)) - self.assertContentsContain(self._expected_new_contents_first_visit) - self.assertEqual(self.loader.counters['contents'], - len(self._expected_new_contents_first_visit)) - - self.assertCountDirectories(len(self._expected_new_directories_first_visit)) # noqa - self.assertDirectoriesContain(self._expected_new_directories_first_visit) # noqa - self.assertEqual(self.loader.counters['directories'], - len(self._expected_new_directories_first_visit)) - - self.assertCountRevisions(1, '1 artifact releases so 1 revisions should be created') # noqa - self.assertRevisionsContain(self._expected_new_revisions_first_visit) - self.assertEqual(self.loader.counters['revisions'], - len(self._expected_new_revisions_first_visit)) - - self.assertCountReleases(0, 'No release is created by the loader') - self.assertEqual(self.loader.counters['releases'], 0) - - self.assertCountSnapshots(1, 'Only 1 snapshot targeting all revisions') - self.assertSnapshotEqual(self._expected_new_snapshot_first_visit, - self._expected_branches_first_visit) - - self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) - self.assertEqual(self.loader.visit_status(), 'full') - - self.assertFalse(os.path.exists(self.loader.temp_directory)) diff --git a/swh/loader/package/tests/test_npm.py b/swh/loader/package/tests/test_npm.py new file mode 100644 index 0000000..2ca63d0 --- /dev/null +++ b/swh/loader/package/tests/test_npm.py @@ -0,0 +1,526 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +import json +import os + +from swh.model.hashutil import hash_to_bytes + +from swh.loader.package.npm import ( + parse_npm_package_author, extract_npm_package_author +) +from swh.loader.package.tests.common import ( + check_snapshot, check_metadata_paths, get_stats +) + +from swh.loader.package.npm import NpmLoader + + +def _parse_author_string_test(author_str, expected_result): + assert parse_npm_package_author(author_str) == expected_result + assert parse_npm_package_author(' %s' % author_str) == expected_result + assert parse_npm_package_author('%s ' % author_str) == expected_result + + +def test_parse_npm_package_author(): + _parse_author_string_test( + 'John Doe', + { + 'name': 'John Doe' + } + ) + + _parse_author_string_test( + '', + { + 'email': 'john.doe@foo.bar' + } + ) + + _parse_author_string_test( + '(https://john.doe)', + { + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe ', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar' + } + ) + + _parse_author_string_test( + 'John Doe', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar' + } + ) + + _parse_author_string_test( + 'John Doe (https://john.doe)', + { + 'name': 'John Doe', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe(https://john.doe)', + { + 'name': 'John Doe', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + ' (https://john.doe)', + { + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + '(https://john.doe) ', + { + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + _parse_author_string_test( + 'John Doe 
(https://john.doe)',
+        {
+            'name': 'John Doe',
+            'email': 'john.doe@foo.bar',
+            'url': 'https://john.doe'
+        }
+    )
+
+    _parse_author_string_test(
+        'John Doe (https://john.doe) <john.doe@foo.bar>',
+        {
+            'name': 'John Doe',
+            'email': 'john.doe@foo.bar',
+            'url': 'https://john.doe'
+        }
+    )
+
+    _parse_author_string_test(
+        'John Doe<john.doe@foo.bar> (https://john.doe)',
+        {
+            'name': 'John Doe',
+            'email': 'john.doe@foo.bar',
+            'url': 'https://john.doe'
+        }
+    )
+
+    _parse_author_string_test(
+        'John Doe<john.doe@foo.bar>(https://john.doe)',
+        {
+            'name': 'John Doe',
+            'email': 'john.doe@foo.bar',
+            'url': 'https://john.doe'
+        }
+    )
+
+    _parse_author_string_test('', {})
+    _parse_author_string_test('<>', {})
+    _parse_author_string_test(' <>', {})
+    _parse_author_string_test('<>()', {})
+    _parse_author_string_test('<> ()', {})
+    _parse_author_string_test('()', {})
+    _parse_author_string_test(' ()', {})
+
+    _parse_author_string_test(
+        'John Doe <> ()',
+        {
+            'name': 'John Doe'
+        }
+    )
+
+    _parse_author_string_test(
+        'John Doe <>',
+        {
+            'name': 'John Doe'
+        }
+    )
+
+    _parse_author_string_test(
+        'John Doe ()',
+        {
+            'name': 'John Doe'
+        }
+    )
+
+
+def test_extract_npm_package_author(datadir):
+    package_metadata_filepath = os.path.join(
+        datadir, 'https_replicate.npmjs.com', 'org_visit1')
+
+    with open(package_metadata_filepath) as json_file:
+        package_metadata = json.load(json_file)
+
+    assert extract_npm_package_author(package_metadata['versions']['0.0.2']) \
+        == {
+            'fullname': b'mooz <stillpedant@gmail.com>',
+            'name': b'mooz',
+            'email': b'stillpedant@gmail.com'
+        }
+
+    assert (
+        extract_npm_package_author(package_metadata['versions']['0.0.3']) ==
+        {
+            'fullname': b'Masafumi Oyamada <stillpedant@gmail.com>',
+            'name': b'Masafumi Oyamada',
+            'email': b'stillpedant@gmail.com'
+        }
+    )
+
+    package_json = json.loads('''
+    {
+        "name": "highlightjs-line-numbers.js",
+        "version": "2.7.0",
+        "description": "Highlight.js line numbers plugin.",
+        "main": "src/highlightjs-line-numbers.js",
+        "dependencies": {},
+        "devDependencies": {
+            "gulp": "^4.0.0",
+            "gulp-rename": "^1.4.0",
+            "gulp-replace": "^0.6.1",
+            "gulp-uglify": "^1.2.0"
+        },
+        "repository": {
+            "type": "git",
+            "url": "https://github.com/wcoder/highlightjs-line-numbers.js.git"
+        },
+        "author": "Yauheni Pakala <evgeniy.pakalo@gmail.com>",
+        "license": "MIT",
+        "bugs": {
+            "url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues"
+        },
+        "homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/"
+    }''')  # noqa
+
+    assert extract_npm_package_author(package_json) == \
+        {
+            'fullname': b'Yauheni Pakala <evgeniy.pakalo@gmail.com>',
+            'name': b'Yauheni Pakala',
+            'email': b'evgeniy.pakalo@gmail.com'
+        }
+
+    package_json = json.loads('''
+    {
+        "name": "3-way-diff",
+        "version": "0.0.1",
+        "description": "3-way diffing of JavaScript objects",
+        "main": "index.js",
+        "authors": [
+            {
+                "name": "Shawn Walsh",
+                "url": "https://github.com/shawnpwalsh"
+            },
+            {
+                "name": "Markham F Rollins IV",
+                "url": "https://github.com/mrollinsiv"
+            }
+        ],
+        "keywords": [
+            "3-way diff",
+            "3 way diff",
+            "three-way diff",
+            "three way diff"
+        ],
+        "devDependencies": {
+            "babel-core": "^6.20.0",
+            "babel-preset-es2015": "^6.18.0",
+            "mocha": "^3.0.2"
+        },
+        "dependencies": {
+            "lodash": "^4.15.0"
+        }
+    }''')
+
+    assert extract_npm_package_author(package_json) == \
+        {
+            'fullname': b'Shawn Walsh',
+            'name': b'Shawn Walsh',
+            'email': None
+        }
+
+    package_json = json.loads('''
+    {
+        "name": "yfe-ynpm",
+        "version": "1.0.0",
+        "homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm",
+        "repository": {
+            "type": "git",
+            "url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git"
+        },
+        "author": [
+            "fengmk2 <fengmk2@gmail.com> 
(https://fengmk2.com)", + "xufuzi (https://7993.org)" + ], + "license": "MIT" + }''') + + assert extract_npm_package_author(package_json) == \ + { + 'fullname': b'fengmk2 ', + 'name': b'fengmk2', + 'email': b'fengmk2@gmail.com' + } + + package_json = json.loads(''' + { + "name": "umi-plugin-whale", + "version": "0.0.8", + "description": "Internal contract component", + "authors": { + "name": "xiaohuoni", + "email": "448627663@qq.com" + }, + "repository": "alitajs/whale", + "devDependencies": { + "np": "^3.0.4", + "umi-tools": "*" + }, + "license": "MIT" + }''') + + assert extract_npm_package_author(package_json) == \ + { + 'fullname': b'xiaohuoni <448627663@qq.com>', + 'name': b'xiaohuoni', + 'email': b'448627663@qq.com' + } + + +def normalize_hashes(hashes): + if isinstance(hashes, str): + return hash_to_bytes(hashes) + if isinstance(hashes, list): + return [hash_to_bytes(x) for x in hashes] + return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()} + + +_expected_new_contents_first_visit = normalize_hashes([ + '4ce3058e16ab3d7e077f65aabf855c34895bf17c', + '858c3ceee84c8311adc808f8cdb30d233ddc9d18', + '0fa33b4f5a4e0496da6843a38ff1af8b61541996', + '85a410f8ef8eb8920f2c384a9555566ad4a2e21b', + '9163ac8025923d5a45aaac482262893955c9b37b', + '692cf623b8dd2c5df2c2998fd95ae4ec99882fb4', + '18c03aac6d3e910efb20039c15d70ab5e0297101', + '41265c42446aac17ca769e67d1704f99e5a1394d', + '783ff33f5882813dca9239452c4a7cadd4dba778', + 'b029cfb85107aee4590c2434a3329bfcf36f8fa1', + '112d1900b4c2e3e9351050d1b542c9744f9793f3', + '5439bbc4bd9a996f1a38244e6892b71850bc98fd', + 'd83097a2f994b503185adf4e719d154123150159', + 'd0939b4898e83090ee55fd9d8a60e312cfadfbaf', + 'b3523a26f7147e4af40d9d462adaae6d49eda13e', + 'cd065fb435d6fb204a8871bcd623d0d0e673088c', + '2854a40855ad839a54f4b08f5cff0cf52fca4399', + 'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe', + '0f73d56e1cf480bded8a1ecf20ec6fc53c574713', + '0d9882b2dfafdce31f4e77fe307d41a44a74cefe', + '585fc5caab9ead178a327d3660d35851db713df1', + 'e8cd41a48d79101977e3036a87aeb1aac730686f', + '5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7', + '9c3cc2763bf9e9e37067d3607302c4776502df98', + '3649a68410e354c83cd4a38b66bd314de4c8f5c9', + 'e96ed0c091de1ebdf587104eaf63400d1974a1fe', + '078ca03d2f99e4e6eab16f7b75fbb7afb699c86c', + '38de737da99514de6559ff163c988198bc91367a', +]) + +_expected_new_directories_first_visit = normalize_hashes([ + '3370d20d6f96dc1c9e50f083e2134881db110f4f', + '42753c0c2ab00c4501b552ac4671c68f3cf5aece', + 'd7895533ef5edbcffdea3f057d9fef3a1ef845ce', + '80579be563e2ef3e385226fe7a3f079b377f142c', + '3b0ddc6a9e58b4b53c222da4e27b280b6cda591c', + 'bcad03ce58ac136f26f000990fc9064e559fe1c0', + '5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca', + 'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd', + '584b5b4b6cf7f038095e820b99386a9c232de931', + '184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a', + 'bb5f4ee143c970367eb409f2e4c1104898048b9d', + '1b95491047add1103db0dfdfa84a9735dcb11e88', + 'a00c6de13471a2d66e64aca140ddb21ef5521e62', + '5ce6c1cd5cda2d546db513aaad8c72a44c7771e2', + 'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2', + '202fafcd7c0f8230e89d5496ad7f44ab12b807bf', + '775cc516543be86c15c1dc172f49c0d4e6e78235', + 'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e', +]) + +_expected_new_revisions_first_visit = normalize_hashes({ + 'd8a1c7474d2956ac598a19f0f27d52f7015f117e': + '42753c0c2ab00c4501b552ac4671c68f3cf5aece', + '5f9eb78af37ffd12949f235e86fac04898f9f72a': + '3370d20d6f96dc1c9e50f083e2134881db110f4f', + 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a': + 
'd7895533ef5edbcffdea3f057d9fef3a1ef845ce'} +) + + +def package_url(package): + return 'https://www.npmjs.com/package/%s' % package + + +def package_metadata_url(package): + return 'https://replicate.npmjs.com/%s/' % package + + +def test_revision_metadata_structure(swh_config, requests_mock_datadir): + package = 'org' + loader = NpmLoader(package, + package_url(package), + package_metadata_url(package)) + + actual_load_status = loader.load() + assert actual_load_status['status'] == 'eventful' + + expected_revision_id = hash_to_bytes( + 'd8a1c7474d2956ac598a19f0f27d52f7015f117e') + revision = list(loader.storage.revision_get([expected_revision_id]))[0] + + assert revision is not None + + check_metadata_paths(revision['metadata'], paths=[ + ('intrinsic.tool', str), + ('intrinsic.raw', dict), + ('extrinsic.provider', str), + ('extrinsic.when', str), + ('extrinsic.raw', dict), + ('original_artifact', list), + ]) + + for original_artifact in revision['metadata']['original_artifact']: + check_metadata_paths(original_artifact, paths=[ + ('filename', str), + ('length', int), + ('checksums', dict), + ]) + + +def test_npm_loader_first_visit(swh_config, requests_mock_datadir): + + package = 'org' + loader = NpmLoader(package, + package_url(package), + package_metadata_url(package)) + + actual_load_status = loader.load() + assert actual_load_status['status'] == 'eventful' + + stats = get_stats(loader.storage) + + assert { + 'content': len(_expected_new_contents_first_visit), + 'directory': len(_expected_new_directories_first_visit), + 'origin': 1, + 'origin_visit': 1, + 'person': 2, + 'release': 0, + 'revision': len(_expected_new_revisions_first_visit), + 'skipped_content': 0, + 'snapshot': 1, + } == stats + + assert len(list(loader.storage.content_get( + _expected_new_contents_first_visit))) == len( + _expected_new_contents_first_visit) + + assert list(loader.storage.directory_missing( + _expected_new_directories_first_visit)) == [] + + assert list(loader.storage.revision_missing( + _expected_new_revisions_first_visit)) == [] + + expected_snapshot = { + 'id': 'd0587e1195aed5a8800411a008f2f2d627f18e2d', + 'branches': { + 'HEAD': { + 'target': 'releases/0.0.4', + 'target_type': 'alias' + }, + 'releases/0.0.2': { + 'target': 'd8a1c7474d2956ac598a19f0f27d52f7015f117e', + 'target_type': 'revision' + }, + 'releases/0.0.3': { + 'target': '5f9eb78af37ffd12949f235e86fac04898f9f72a', + 'target_type': 'revision' + }, + 'releases/0.0.4': { + 'target': 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a', + 'target_type': 'revision' + } + } + } + check_snapshot(expected_snapshot, loader.storage) + + +def test_npm_loader_incremental_visit( + swh_config, requests_mock_datadir_visits): + package = 'org' + url = package_url(package) + metadata_url = package_metadata_url(package) + loader = NpmLoader(package, url, metadata_url) + + actual_load_status = loader.load() + + assert actual_load_status['status'] == 'eventful' + origin_visit = list(loader.storage.origin_visit_get(url))[-1] + assert origin_visit['status'] == 'full' + + stats = get_stats(loader.storage) + + assert { + 'content': len(_expected_new_contents_first_visit), + 'directory': len(_expected_new_directories_first_visit), + 'origin': 1, + 'origin_visit': 1, + 'person': 2, + 'release': 0, + 'revision': len(_expected_new_revisions_first_visit), + 'skipped_content': 0, + 'snapshot': 1, + } == stats + + loader._info = None # reset loader internal state + actual_load_status2 = loader.load() + + assert actual_load_status2['status'] == 'eventful' + origin_visit2 = 
list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit2['status'] == 'full'
+
+    stats = get_stats(loader.storage)
+
+    assert {  # 3 new releases artifacts
+        'content': len(_expected_new_contents_first_visit) + 14,
+        'directory': len(_expected_new_directories_first_visit) + 15,
+        'origin': 1,
+        'origin_visit': 2,
+        'person': 2,
+        'release': 0,
+        'revision': len(_expected_new_revisions_first_visit) + 3,
+        'skipped_content': 0,
+        'snapshot': 2,
+    } == stats
+
+    urls = [
+        m.url for m in requests_mock_datadir_visits.request_history
+        if m.url.startswith('https://registry.npmjs.org')
+    ]
+    # we visited each artifact only once across the 2 visits
+    assert len(urls) == len(set(urls))
diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py
new file mode 100644
index 0000000..5a05a97
--- /dev/null
+++ b/swh/loader/package/tests/test_pypi.py
@@ -0,0 +1,661 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+
+from os import path
+
+import pytest
+
+from unittest.mock import patch
+
+from swh.core.tarball import uncompress
+from swh.core.pytest_plugin import requests_mock_datadir_factory
+from swh.model.hashutil import hash_to_bytes
+
+from swh.loader.package.pypi import (
+    PyPILoader, pypi_api_url, author, extract_intrinsic_metadata
+)
+from swh.loader.package.tests.common import (
+    check_snapshot, check_metadata_paths, get_stats
+)
+
+
+def test_author_basic():
+    data = {
+        'author': "i-am-groot",
+        'author_email': 'iam@groot.org',
+    }
+    actual_author = author(data)
+
+    expected_author = {
+        'fullname': b'i-am-groot <iam@groot.org>',
+        'name': b'i-am-groot',
+        'email': b'iam@groot.org',
+    }
+
+    assert actual_author == expected_author
+
+
+def test_author_empty_email():
+    data = {
+        'author': 'i-am-groot',
+        'author_email': '',
+    }
+    actual_author = author(data)
+
+    expected_author = {
+        'fullname': b'i-am-groot',
+        'name': b'i-am-groot',
+        'email': b'',
+    }
+
+    assert actual_author == expected_author
+
+
+def test_author_empty_name():
+    data = {
+        'author': "",
+        'author_email': 'iam@groot.org',
+    }
+    actual_author = author(data)
+
+    expected_author = {
+        'fullname': b' <iam@groot.org>',
+        'name': b'',
+        'email': b'iam@groot.org',
+    }
+
+    assert actual_author == expected_author
+
+
+def test_author_malformed():
+    data = {
+        'author': "['pierre', 'paul', 'jacques']",
+        'author_email': None,
+    }
+
+    actual_author = author(data)
+
+    expected_author = {
+        'fullname': b"['pierre', 'paul', 'jacques']",
+        'name': b"['pierre', 'paul', 'jacques']",
+        'email': None,
+    }
+
+    assert actual_author == expected_author
+
+
+def test_author_malformed_2():
+    data = {
+        'author': '[marie, jeanne]',
+        'author_email': '[marie@some, jeanne@thing]',
+    }
+
+    actual_author = author(data)
+
+    expected_author = {
+        'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
+        'name': b'[marie, jeanne]',
+        'email': b'[marie@some, jeanne@thing]',
+    }
+
+    assert actual_author == expected_author
+
+
+def test_author_malformed_3():
+    data = {
+        'author': '[marie, jeanne, pierre]',
+        'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
+    }
+
+    actual_author = author(data)
+
+    expected_author = {
+        'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>',  # noqa
+        'name': b'[marie, jeanne, pierre]',
+        'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
+    }
+
+    assert actual_author == 
expected_author
+
+
+# configuration error #
+
+def test_badly_configured_loader_raise(monkeypatch):
+    """Badly configured loader should raise"""
+    monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False)
+    with pytest.raises(ValueError) as e:
+        PyPILoader(url='some-url')
+
+    assert 'Misconfiguration' in e.value.args[0]
+
+
+def test_pypi_api_url():
+    """Computing the pypi api url from the pypi project url should be ok"""
+    url = pypi_api_url('https://pypi.org/project/requests')
+    assert url == 'https://pypi.org/pypi/requests/json'
+
+
+@pytest.mark.fs
+def test_extract_intrinsic_metadata(tmp_path, datadir):
+    """Parsing an existing archive's PKG-INFO should yield its metadata"""
+    uncompressed_archive_path = str(tmp_path)
+    archive_path = path.join(
+        datadir, 'https_files.pythonhosted.org', '0805nexter-1.1.0.zip')
+    uncompress(archive_path, dest=uncompressed_archive_path)
+
+    actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path)
+    expected_metadata = {
+        'metadata_version': '1.0',
+        'name': '0805nexter',
+        'version': '1.1.0',
+        'summary': 'a simple printer of nested lest',
+        'home_page': 'http://www.hp.com',
+        'author': 'hgtkpython',
+        'author_email': '2868989685@qq.com',
+        'platforms': ['UNKNOWN'],
+    }
+
+    assert actual_metadata == expected_metadata
+
+
+@pytest.mark.fs
+def test_extract_intrinsic_metadata_failures(tmp_path):
+    """Parsing a nonexistent path/archive/PKG-INFO yields an empty dict"""
+    tmp_path = str(tmp_path)  # py3.5 work around (PosixPath issue)
+    # nonexistent first-level path
+    assert extract_intrinsic_metadata('/something-inexistant') == {}
+    # nonexistent second-level path (as expected by pypi archives)
+    assert extract_intrinsic_metadata(tmp_path) == {}
+    # missing PKG-INFO within the second-level path
+    existing_path_no_pkginfo = path.join(tmp_path, 'something')
+    os.mkdir(existing_path_no_pkginfo)
+    assert extract_intrinsic_metadata(tmp_path) == {}
+
+
+# LOADER SCENARIO #
+
+# "edge" cases (for the same origin) #
+
+
+# no release artifact:
+# {visit full, status: uneventful, no contents, etc...}
+requests_mock_datadir_missing_all = requests_mock_datadir_factory(ignore_urls=[
+    'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip',  # noqa
+    'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip',  # noqa
+])
+
+
+def test_no_release_artifact(swh_config, requests_mock_datadir_missing_all):
+    """Loading a pypi project with all artifacts missing ends up with an
+    empty snapshot and a partial visit
+
+    """
+    url = 'https://pypi.org/project/0805nexter'
+    loader = PyPILoader(url)
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'uneventful'
+
+    stats = get_stats(loader.storage)
+    assert {
+        'content': 0,
+        'directory': 0,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 0,
+        'release': 0,
+        'revision': 0,
+        'skipped_content': 0,
+        'snapshot': 1,
+    } == stats
+
+    origin_visit = next(loader.storage.origin_visit_get(url))
+    assert origin_visit['status'] == 'partial'
+
+
+# problem during loading:
+# {visit: partial, status: uneventful, no snapshot}
+
+
+def test_release_with_traceback(swh_config):
+    url = 'https://pypi.org/project/0805nexter'
+    with patch('swh.loader.package.pypi.PyPILoader.get_default_version',
+               side_effect=ValueError('Problem')):
+        loader = PyPILoader(url)
+
+        actual_load_status = loader.load()
+        assert actual_load_status['status'] == 'failed'
+
+        stats = get_stats(loader.storage)
+
+        assert {
+            'content': 0,
+            
'directory': 0, + 'origin': 1, + 'origin_visit': 1, + 'person': 0, + 'release': 0, + 'revision': 0, + 'skipped_content': 0, + 'snapshot': 0, + } == stats + + origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'partial' + + +# problem during loading: failure early enough in between swh contents... +# some contents (contents, directories, etc...) have been written in storage +# {visit: partial, status: eventful, no snapshot} + +# problem during loading: failure late enough we can have snapshots (some +# revisions are written in storage already) +# {visit: partial, status: eventful, snapshot} + +# "normal" cases (for the same origin) # + + +requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[ + 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa +]) + +# some missing release artifacts: +# {visit partial, status: eventful, 1 snapshot} + + +def test_revision_metadata_structure(swh_config, requests_mock_datadir): + url = 'https://pypi.org/project/0805nexter' + loader = PyPILoader(url) + + actual_load_status = loader.load() + assert actual_load_status['status'] == 'eventful' + + expected_revision_id = hash_to_bytes( + 'e445da4da22b31bfebb6ffc4383dbf839a074d21') + revision = list(loader.storage.revision_get([expected_revision_id]))[0] + + assert revision is not None + + check_metadata_paths(revision['metadata'], paths=[ + ('intrinsic.tool', str), + ('intrinsic.raw', dict), + ('extrinsic.provider', str), + ('extrinsic.when', str), + ('extrinsic.raw', dict), + ('original_artifact', list), + ]) + + for original_artifact in revision['metadata']['original_artifact']: + check_metadata_paths(original_artifact, paths=[ + ('filename', str), + ('length', int), + ('checksums', dict), + ]) + + +def test_visit_with_missing_artifact( + swh_config, requests_mock_datadir_missing_one): + """Load a pypi project with some missing artifacts ends up with 1 snapshot + + """ + url = 'https://pypi.org/project/0805nexter' + loader = PyPILoader(url) + + actual_load_status = loader.load() + assert actual_load_status['status'] == 'eventful' + + stats = get_stats(loader.storage) + + assert { + 'content': 3, + 'directory': 2, + 'origin': 1, + 'origin_visit': 1, + 'person': 1, + 'release': 0, + 'revision': 1, + 'skipped_content': 0, + 'snapshot': 1 + } == stats + + expected_contents = map(hash_to_bytes, [ + '405859113963cb7a797642b45f171d6360425d16', + 'e5686aa568fdb1d19d7f1329267082fe40482d31', + '83ecf6ec1114fd260ca7a833a2d165e71258c338', + ]) + + assert list(loader.storage.content_missing_per_sha1(expected_contents))\ + == [] + + expected_dirs = map(hash_to_bytes, [ + 'b178b66bd22383d5f16f4f5c923d39ca798861b4', + 'c3a58f8b57433a4b56caaa5033ae2e0931405338', + ]) + + assert list(loader.storage.directory_missing(expected_dirs)) == [] + + # {revision hash: directory hash} + expected_revs = { + hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa + } + assert list(loader.storage.revision_missing(expected_revs)) == [] + + expected_branches = { + 'releases/1.2.0': { + 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', + 'target_type': 'revision', + }, + 'HEAD': { + 'target': 'releases/1.2.0', + 'target_type': 'alias', + }, + } + + expected_snapshot = { + 'id': 'dd0e4201a232b1c104433741dbf45895b8ac9355', + 'branches': expected_branches, + } + check_snapshot(expected_snapshot, storage=loader.storage) + + 
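+    # Note (assumption drawn from the fixtures above): the snapshot targets
+    # only the reachable 1.2.0 artifact; because the other artifact was made
+    # unreachable by requests_mock_datadir_missing_one, the visit below is
+    # reported as 'partial' rather than 'full'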
origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'partial' + + +def test_visit_with_1_release_artifact(swh_config, requests_mock_datadir): + """With no prior visit, load a pypi project ends up with 1 snapshot + + """ + url = 'https://pypi.org/project/0805nexter' + loader = PyPILoader(url) + + actual_load_status = loader.load() + assert actual_load_status['status'] == 'eventful' + + stats = get_stats(loader.storage) + assert { + 'content': 6, + 'directory': 4, + 'origin': 1, + 'origin_visit': 1, + 'person': 1, + 'release': 0, + 'revision': 2, + 'skipped_content': 0, + 'snapshot': 1 + } == stats + + expected_contents = map(hash_to_bytes, [ + 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', + '938c33483285fd8ad57f15497f538320df82aeb8', + 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', + '405859113963cb7a797642b45f171d6360425d16', + 'e5686aa568fdb1d19d7f1329267082fe40482d31', + '83ecf6ec1114fd260ca7a833a2d165e71258c338', + ]) + + assert list(loader.storage.content_missing_per_sha1(expected_contents))\ + == [] + + expected_dirs = map(hash_to_bytes, [ + '05219ba38bc542d4345d5638af1ed56c7d43ca7d', + 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', + 'b178b66bd22383d5f16f4f5c923d39ca798861b4', + 'c3a58f8b57433a4b56caaa5033ae2e0931405338', + ]) + + assert list(loader.storage.directory_missing(expected_dirs)) == [] + + # {revision hash: directory hash} + expected_revs = { + hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa + hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa + } + assert list(loader.storage.revision_missing(expected_revs)) == [] + + expected_branches = { + 'releases/1.1.0': { + 'target': '4c99891f93b81450385777235a37b5e966dd1571', + 'target_type': 'revision', + }, + 'releases/1.2.0': { + 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', + 'target_type': 'revision', + }, + 'HEAD': { + 'target': 'releases/1.2.0', + 'target_type': 'alias', + }, + } + + expected_snapshot = { + 'id': 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a', + 'branches': expected_branches, + } + check_snapshot(expected_snapshot, loader.storage) + + origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'full' + + +def test_multiple_visits_with_no_change(swh_config, requests_mock_datadir): + """Multiple visits with no changes results in 1 same snapshot + + """ + url = 'https://pypi.org/project/0805nexter' + loader = PyPILoader(url) + + actual_load_status = loader.load() + assert actual_load_status['status'] == 'eventful' + + stats = get_stats(loader.storage) + + assert { + 'content': 6, + 'directory': 4, + 'origin': 1, + 'origin_visit': 1, + 'person': 1, + 'release': 0, + 'revision': 2, + 'skipped_content': 0, + 'snapshot': 1 + } == stats + + expected_branches = { + 'releases/1.1.0': { + 'target': '4c99891f93b81450385777235a37b5e966dd1571', + 'target_type': 'revision', + }, + 'releases/1.2.0': { + 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', + 'target_type': 'revision', + }, + 'HEAD': { + 'target': 'releases/1.2.0', + 'target_type': 'alias', + }, + } + + snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a' + expected_snapshot = { + 'id': snapshot_id, + 'branches': expected_branches, + } + check_snapshot(expected_snapshot, loader.storage) + + origin_visit = next(loader.storage.origin_visit_get(url)) + assert origin_visit['status'] == 'full' + + actual_load_status2 = loader.load() + assert 
actual_load_status2['status'] == 'uneventful' + + stats2 = get_stats(loader.storage) + expected_stats2 = stats.copy() + expected_stats2['origin_visit'] = 1 + 1 + assert expected_stats2 == stats2 + + # same snapshot + actual_snapshot_id = origin_visit['snapshot'] + assert actual_snapshot_id == hash_to_bytes(snapshot_id) + + +def test_incremental_visit(swh_config, requests_mock_datadir_visits): + """With prior visit, 2nd load will result with a different snapshot + + """ + url = 'https://pypi.org/project/0805nexter' + loader = PyPILoader(url) + + visit1_actual_load_status = loader.load() + visit1_stats = get_stats(loader.storage) + assert visit1_actual_load_status['status'] == 'eventful' + origin_visit1 = next(loader.storage.origin_visit_get(url)) + assert origin_visit1['status'] == 'full' + + assert { + 'content': 6, + 'directory': 4, + 'origin': 1, + 'origin_visit': 1, + 'person': 1, + 'release': 0, + 'revision': 2, + 'skipped_content': 0, + 'snapshot': 1 + } == visit1_stats + + # Reset internal state + loader._info = None + + visit2_actual_load_status = loader.load() + visit2_stats = get_stats(loader.storage) + + assert visit2_actual_load_status['status'] == 'eventful' + visits = list(loader.storage.origin_visit_get(url)) + assert len(visits) == 2 + assert visits[1]['status'] == 'full' + + assert { + 'content': 6 + 1, # 1 more content + 'directory': 4 + 2, # 2 more directories + 'origin': 1, + 'origin_visit': 1 + 1, + 'person': 1, + 'release': 0, + 'revision': 2 + 1, # 1 more revision + 'skipped_content': 0, + 'snapshot': 1 + 1, # 1 more snapshot + } == visit2_stats + + expected_contents = map(hash_to_bytes, [ + 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', + '938c33483285fd8ad57f15497f538320df82aeb8', + 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', + '405859113963cb7a797642b45f171d6360425d16', + 'e5686aa568fdb1d19d7f1329267082fe40482d31', + '83ecf6ec1114fd260ca7a833a2d165e71258c338', + '92689fa2b7fb4d4fc6fb195bf73a50c87c030639' + ]) + + assert list(loader.storage.content_missing_per_sha1(expected_contents))\ + == [] + + expected_dirs = map(hash_to_bytes, [ + '05219ba38bc542d4345d5638af1ed56c7d43ca7d', + 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', + 'b178b66bd22383d5f16f4f5c923d39ca798861b4', + 'c3a58f8b57433a4b56caaa5033ae2e0931405338', + 'e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a', + '52604d46843b898f5a43208045d09fcf8731631b', + + ]) + + assert list(loader.storage.directory_missing(expected_dirs)) == [] + + # {revision hash: directory hash} + expected_revs = { + hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa + hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa + hash_to_bytes('51247143b01445c9348afa9edfae31bf7c5d86b1'): hash_to_bytes('e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a'), # noqa + } + + assert list(loader.storage.revision_missing(expected_revs)) == [] + + expected_branches = { + 'releases/1.1.0': { + 'target': '4c99891f93b81450385777235a37b5e966dd1571', + 'target_type': 'revision', + }, + 'releases/1.2.0': { + 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', + 'target_type': 'revision', + }, + 'releases/1.3.0': { + 'target': '51247143b01445c9348afa9edfae31bf7c5d86b1', + 'target_type': 'revision', + }, + 'HEAD': { + 'target': 'releases/1.3.0', + 'target_type': 'alias', + }, + } + expected_snapshot = { + 'id': '2e5149a7b0725d18231a37b342e9b7c4e121f283', + 'branches': expected_branches, + } + + check_snapshot(expected_snapshot, 
+
+    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit['status'] == 'full'
+
+    urls = [
+        m.url for m in requests_mock_datadir_visits.request_history
+        if m.url.startswith('https://files.pythonhosted.org')
+    ]
+    # each artifact is visited only once across the 2 visits
+    assert len(urls) == len(set(urls))
+
+
+# release artifact, no new artifact
+# {visit full, status uneventful, same snapshot as before}
+
+# release artifact, old artifact with different checksums
+# {visit full, status full, new snapshot with shared history plus some
+# new, diverging history}
+
+# release with multiple sdist artifacts per pypi "version"
+# snapshot branch output differs (one branch per artifact)
+
+def test_visit_1_release_with_2_artifacts(swh_config, requests_mock_datadir):
+    """Loading a pypi release with 2 artifacts ends up with 1 snapshot
+
+    """
+    url = 'https://pypi.org/project/nexter'
+    loader = PyPILoader(url)
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+
+    expected_branches = {
+        'releases/1.1.0/nexter-1.1.0.zip': {
+            'target': '4c99891f93b81450385777235a37b5e966dd1571',
+            'target_type': 'revision',
+        },
+        'releases/1.1.0/nexter-1.1.0.tar.gz': {
+            'target': '0bf88f5760cca7665d0af4d6575d9301134fe11a',
+            'target_type': 'revision',
+        },
+    }
+
+    expected_snapshot = {
+        'id': 'a27e638a4dad6fbfa273c6ebec1c4bf320fb84c6',
+        'branches': expected_branches,
+    }
+    check_snapshot(expected_snapshot, loader.storage)
+
+    origin_visit = next(loader.storage.origin_visit_get(url))
+    assert origin_visit['status'] == 'full'
diff --git a/swh/loader/package/tests/test_tasks.py b/swh/loader/package/tests/test_tasks.py
new file mode 100644
index 0000000..57acbc4
--- /dev/null
+++ b/swh/loader/package/tests/test_tasks.py
@@ -0,0 +1,83 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from unittest.mock import patch
+
+
+@patch('swh.loader.package.archive.ArchiveLoader.load')
+def test_gnu_loader(
+        mock_loader, swh_app, celery_session_worker, swh_config):
+    mock_loader.return_value = {'status': 'eventful'}
+
+    res = swh_app.send_task(
+        'swh.loader.package.tasks.LoadArchive',
+        (), dict(url='some-url', artifacts=[]))
+    assert res
+    res.wait()
+    assert res.successful()
+
+    assert res.result == {'status': 'eventful'}
+
+
+@patch('swh.loader.package.debian.DebianLoader.load')
+def test_debian_loader(
+        mock_loader, swh_app, celery_session_worker, swh_config):
+    mock_loader.return_value = {'status': 'eventful'}
+
+    res = swh_app.send_task(
+        'swh.loader.package.tasks.LoadDebian',
+        (), dict(url='some-url', date='some-date', packages={}))
+    assert res
+    res.wait()
+    assert res.successful()
+
+    assert res.result == {'status': 'eventful'}
+
+
+@patch('swh.loader.package.deposit.DepositLoader.load')
+def test_deposit_loader(
+        mock_loader, swh_app, celery_session_worker, swh_config):
+    mock_loader.return_value = {'status': 'eventful'}
+
+    res = swh_app.send_task(
+        'swh.loader.package.tasks.LoadDeposit',
+        (), dict(url='some-url', deposit_id='some-d-id'))
+    assert res
+    res.wait()
+    assert res.successful()
+
+    assert res.result == {'status': 'eventful'}
+
+
+@patch('swh.loader.package.npm.NpmLoader.load')
+def test_npm_loader(
+        mock_loader, swh_app, celery_session_worker, swh_config):
+    mock_loader.return_value = {'status': 'eventful'}
+
+    res = swh_app.send_task(
+        'swh.loader.package.tasks.LoadNpm',
+        (), dict(package_name='some-package',
+                 package_url='some',
+                 package_metadata_url='something'))
+    assert res
+    res.wait()
+    assert res.successful()
+
+    assert res.result == {'status': 'eventful'}
+
+
+@patch('swh.loader.package.pypi.PyPILoader.load')
+def test_pypi_loader(
+        mock_loader, swh_app, celery_session_worker, swh_config):
+    mock_loader.return_value = {'status': 'eventful'}
+
+    res = swh_app.send_task(
+        'swh.loader.package.tasks.LoadPyPI',
+        (), dict(url='some-url'))
+    assert res
+    res.wait()
+    assert res.successful()
+
+    assert res.result == {'status': 'eventful'}
diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
new file mode 100644
index 0000000..3e2d17f
--- /dev/null
+++ b/swh/loader/package/tests/test_utils.py
@@ -0,0 +1,153 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+import os
+import pytest
+
+from swh.loader.package.utils import download, api_info, release_name
+
+
+@pytest.mark.fs
+def test_download_fail_to_download(tmp_path, requests_mock):
+    url = 'https://pypi.org/pypi/arrow/json'
+    status_code = 404
+    requests_mock.get(url, status_code=status_code)
+
+    with pytest.raises(ValueError) as e:
+        download(url, tmp_path)
+
+    assert e.value.args[0] == "Fail to query '%s'. Reason: %s" % (
+        url, status_code)
+
+
+@pytest.mark.fs
+def test_download_fail_length_mismatch(tmp_path, requests_mock):
+    """A length mismatch after download should raise
+
+    """
+    filename = 'requests-0.0.1.tar.gz'
+    url = 'https://pypi.org/pypi/requests/%s' % filename
+    data = 'this is something'
+    wrong_size = len(data) - 3
+    requests_mock.get(url, text=data, headers={
+        'content-length': str(wrong_size)  # wrong size!
+    })
+
+    with pytest.raises(ValueError) as e:
+        download(url, dest=str(tmp_path))
+
+    assert e.value.args[0] == "Error when checking size: %s != %s" % (
+        wrong_size, len(data)
+    )
+
+
+@pytest.mark.fs
+def test_download_ok(tmp_path, requests_mock):
+    """Download without issue should provide filename and hashes"""
+    filename = 'requests-0.0.1.tar.gz'
+    url = 'https://pypi.org/pypi/requests/%s' % filename
+    data = 'this is something'
+    requests_mock.get(url, text=data, headers={
+        'content-length': str(len(data))
+    })
+
+    actual_filepath, actual_hashes = download(url, dest=str(tmp_path))
+
+    actual_filename = os.path.basename(actual_filepath)
+    assert actual_filename == filename
+    assert actual_hashes['length'] == len(data)
+    assert actual_hashes['checksums']['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2'  # noqa
+    assert (actual_hashes['checksums']['sha256'] ==
+            '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5')
+
+
+@pytest.mark.fs
+def test_download_ok_with_hashes(tmp_path, requests_mock):
+    """Download with correct expected hashes provided should be ok"""
+    filename = 'requests-0.0.1.tar.gz'
+    url = 'https://pypi.org/pypi/requests/%s' % filename
+    data = 'this is something'
+    requests_mock.get(url, text=data, headers={
+        'content-length': str(len(data))
+    })
+
+    # good hashes for such file
+    good = {
+        'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2',
+        'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5',  # noqa
+    }
+
+    actual_filepath, actual_hashes = download(url, dest=str(tmp_path),
+                                              hashes=good)
+
+    actual_filename = os.path.basename(actual_filepath)
+    assert actual_filename == filename
+    assert actual_hashes['length'] == len(data)
+    assert actual_hashes['checksums']['sha1'] == good['sha1']
+    assert actual_hashes['checksums']['sha256'] == good['sha256']
+
+
+@pytest.mark.fs
+def test_download_fail_hashes_mismatch(tmp_path, requests_mock):
+    """A checksum mismatch after download should raise
+
+    """
+    filename = 'requests-0.0.1.tar.gz'
+    url = 'https://pypi.org/pypi/requests/%s' % filename
+    data = 'this is something'
+    requests_mock.get(url, text=data, headers={
+        'content-length': str(len(data))
+    })
+
+    # good hashes for such file
+    good = {
+        'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2',
+        'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5',  # noqa
+    }
+
+    for hash_algo in good.keys():
+        wrong_hash = good[hash_algo].replace('1', '0')
+        expected_hashes = good.copy()
+        expected_hashes[hash_algo] = wrong_hash  # set the wrong hash
+
+        expected_msg = ("Failure when fetching %s. "
+                        "Checksum mismatched: %s != %s" % (
+                            url, wrong_hash, good[hash_algo]
+                        ))
+
+        with pytest.raises(ValueError, match=expected_msg):
+            download(url, dest=str(tmp_path), hashes=expected_hashes)
+
+
+def test_api_info_failure(requests_mock):
+    """Failure to fetch info/release information should raise"""
+    url = 'https://pypi.org/pypi/requests/json'
+    status_code = 400
+    requests_mock.get(url, status_code=status_code)
+
+    with pytest.raises(ValueError) as e0:
+        api_info(url)
+
+    assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (
+        url, status_code
+    )
+
+
+def test_api_info(requests_mock):
+    """Fetching json info from pypi project should be ok"""
+    url = 'https://pypi.org/pypi/requests/json'
+    requests_mock.get(url, text='{"version": "0.0.1"}')
+    actual_info = api_info(url)
+    assert actual_info == {
+        'version': '0.0.1',
+    }
+
+
+def test_release_name():
+    for version, filename, expected_release in [
+            ('0.0.1', None, 'releases/0.0.1'),
+            ('0.0.2', 'something', 'releases/0.0.2/something')]:
+        assert release_name(version, filename) == expected_release
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
new file mode 100644
index 0000000..3dedc2b
--- /dev/null
+++ b/swh/loader/package/utils.py
@@ -0,0 +1,115 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+import os
+import requests
+
+from typing import Dict, Optional, Tuple
+
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.loader.package import DEFAULT_PARAMS
+
+
+logger = logging.getLogger(__name__)
+
+
+def api_info(url: str) -> Dict:
+    """Basic api client to retrieve information on a project. This deals
+       with fetching json metadata about a package from a registry api.
+
+    Args:
+        url (str): The api url (e.g. PyPI, npm, ...)
+
+    Raises:
+        ValueError in case of query failure (e.g. a 404 response)
+
+    Returns:
+        The associated response's information dict
+
+    """
+    response = requests.get(url, **DEFAULT_PARAMS)
+    if response.status_code != 200:
+        raise ValueError("Fail to query '%s'. Reason: %s" % (
+            url, response.status_code))
+    return response.json()
+
+
+def download(url: str, dest: str, hashes: Dict = {},
+             filename: Optional[str] = None) -> Tuple[str, Dict]:
+    """Download a remote file from url and compute swh hashes on it
+       (uncompression happens later, in the loader itself).
+
+    Args:
+        url: Artifact uri to fetch and hash
+        dest: Directory to write the archive to
+
+        hashes: Dict of expected hashes (key is the hash algo) for the artifact
+            to download (those hashes are expected to be hex strings)
+
+    Raises:
+        ValueError in case of any error when fetching/computing (length or
+        checksum mismatch, ...)
+
+    Returns:
+        Tuple of local (filepath, hashes of filepath)
+
+    """
+    response = requests.get(url, **DEFAULT_PARAMS, stream=True)
+    logger.debug('headers: %s', response.headers)
+    if response.status_code != 200:
+        raise ValueError("Fail to query '%s'. Reason: %s" % (
+            url, response.status_code))
+    length = int(response.headers['content-length'])
+
+    filename = filename if filename else os.path.basename(url)
+    logger.debug('filename: %s', filename)
+    filepath = os.path.join(dest, filename)
+    logger.debug('filepath: %s', filepath)
+
+    h = MultiHash(length=length)
+    with open(filepath, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+            h.update(chunk)
+            f.write(chunk)
+
+    actual_length = os.path.getsize(filepath)
+    if length != actual_length:
+        raise ValueError('Error when checking size: %s != %s' % (
+            length, actual_length))
+
+    # Also check the expected hashes if provided
+    if hashes:
+        actual_hashes = h.hexdigest()
+        for algo_hash in hashes.keys():
+            actual_digest = actual_hashes[algo_hash]
+            expected_digest = hashes[algo_hash]
+            if actual_digest != expected_digest:
+                raise ValueError(
+                    'Failure when fetching %s. '
+                    'Checksum mismatched: %s != %s' % (
+                        url, expected_digest, actual_digest))
+
+    extrinsic_metadata = {
+        'length': length,
+        'filename': filename,
+        'checksums': {
+            **h.hexdigest()
+        },
+    }
+
+    logger.debug('extrinsic_metadata: %s', extrinsic_metadata)
+
+    return filepath, extrinsic_metadata
+
+
+def release_name(version: str, filename: Optional[str] = None) -> str:
+    """Compute the release branch name from a version and an optional
+       artifact filename (e.g. 'releases/1.2.0/nexter-1.2.0.zip').
+
+    """
+    if filename:
+        return 'releases/%s/%s' % (version, filename)
+    return 'releases/%s' % version
diff --git a/tox.ini b/tox.ini
index 8b6068a..face0d6 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,24 +1,33 @@
 [tox]
 envlist=flake8,mypy,py3
 
 [testenv:py3]
 deps =
   .[testing]
   pytest-cov
 commands =
-  pytest --cov={envsitepackagesdir}/swh/loader {envsitepackagesdir}/swh/loader --cov-branch {posargs}
+  pytest --cov={envsitepackagesdir}/swh/loader/ --cov-branch \
+         {envsitepackagesdir}/swh/loader/ {posargs}
+
+[testenv:py3-dev]
+deps =
+  .[testing]
+  pytest-cov
+  ipdb
+commands =
+  pytest {envsitepackagesdir}/swh/loader/ {posargs}
 
 [testenv:flake8]
 skip_install = true
 deps =
   flake8
 commands =
   {envpython} -m flake8
 
 [testenv:mypy]
 skip_install = true
 deps =
   .[testing]
   mypy
 commands =
   mypy swh
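As a usage sketch of the new helpers (illustrative only, not part of the diff): download() streams the response body through MultiHash, verifies the content-length header and any caller-supplied digests, and returns the local path plus an extrinsic metadata dict; release_name() then derives the snapshot branch names seen in the pypi tests. The URL and package name below are hypothetical; requests_mock (the same test double used in the tests above) serves known bytes, so the pinned sha256 is exact.

    import hashlib
    import tempfile

    import requests_mock

    from swh.loader.package.utils import download, release_name

    data = b'this is something'
    # hypothetical artifact url, served locally by the mocker below
    url = 'https://example.org/files/pkg-1.0.0.tar.gz'

    with requests_mock.Mocker() as mocker, \
            tempfile.TemporaryDirectory() as tmpdir:
        # content-length is required: download() reads it before hashing
        mocker.get(url, content=data,
                   headers={'content-length': str(len(data))})
        filepath, info = download(
            url, dest=tmpdir,
            hashes={'sha256': hashlib.sha256(data).hexdigest()})
        # a digest mismatch above would have raised ValueError
        assert info['length'] == len(data)
        assert info['filename'] == 'pkg-1.0.0.tar.gz'
        assert 'sha1' in info['checksums']  # MultiHash computes every swh algo

    # release_name() builds the snapshot branch names used by the loaders
    assert release_name('1.0.0') == 'releases/1.0.0'
    assert release_name('1.0.0', 'pkg-1.0.0.tar.gz') == \
        'releases/1.0.0/pkg-1.0.0.tar.gz'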