diff --git a/swh/loader/package/__init__.py b/swh/loader/package/__init__.py
index e69de29..a8b4a14 100644
--- a/swh/loader/package/__init__.py
+++ b/swh/loader/package/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+try:
+    from swh.loader.core._version import __version__
+except ImportError:
+    __version__ = 'devel'
+
+
+DEFAULT_PARAMS = {
+    'headers': {
+        'User-Agent': 'Software Heritage Loader (%s)' % (
+            __version__
+        )
+    }
+}
diff --git a/swh/loader/package/gnu.py b/swh/loader/package/gnu.py
index 2aab7f7..e78fe42 100644
--- a/swh/loader/package/gnu.py
+++ b/swh/loader/package/gnu.py
@@ -1,477 +1,108 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import os
-import shutil
-import requests
+from os import path
 
-try:
-    from _version import __version__
-except ImportError:
-    __version__ = 'devel'
+from typing import Generator, Dict, Tuple, Sequence
 
+from swh.loader.package.loader import PackageLoader
+from swh.loader.package.utils import download
 
-from tempfile import mkdtemp
-
-from swh.core import tarball
-from swh.loader.core.utils import clean_dangling_folders
-from swh.loader.core.loader import BufferedLoader
 from swh.model.identifiers import normalize_timestamp
-from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
-from swh.model.from_disk import Directory
-from swh.model.identifiers import (
-    identifier_to_bytes, revision_identifier, snapshot_identifier
-)
-
-DEBUG_MODE = '** DEBUG MODE **'
 
 
+def get_version(url):
+    """Extract the version name from a tarball URL.
+
+    Args:
+        url (str): Tarball URL
+
+    Returns:
+        str: Version name
+
+    Example:
+
+        >>> get_version('https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz')
+        '8sync-0.2.0'
+
+    """
+    branch_name = ''
+    filename = path.basename(url)
+    filename_parts = filename.split(".")
+    if len(filename_parts) > 1 and filename_parts[-2] == 'tar':
+        for part in filename_parts[:-2]:
+            branch_name += '.' + part
+    elif len(filename_parts) > 1 and filename_parts[-1] == 'zip':
+        for part in filename_parts[:-1]:
+            branch_name += '.' + part
 
-class GNULoader(BufferedLoader):
+    return branch_name[1:]
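Editor's note: get_version returns only the bare version label; the `releases/` prefix visible in the updated test expectations is, as far as this diff shows, prepended by the generic PackageLoader machinery. A minimal sketch of that assumed relationship:

# Illustration only (not part of the patch): how a tarball URL relates to
# the snapshot branch names expected by the tests below. The 'releases/'
# prefix is assumed to come from PackageLoader, not from get_version.
url = 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz'
version = get_version(url)        # '8sync-0.1.0'
branch = 'releases/%s' % version  # 'releases/8sync-0.1.0'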
- - """ - if self.debug: - self.log.warning('%s Will not pre-clean up temp dir %s' % ( - DEBUG_MODE, self.temp_directory - )) - return - clean_dangling_folders(self.temp_directory, - pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN, - log=self.log) - - def prepare_origin_visit(self, name, origin_url, **kwargs): - """Prepare package visit. + def __init__(self, package: str, package_url: str, tarballs: Sequence): + """Loader constructor. - Args: - name (str): Package Name - origin_url (str): Package origin url - **kwargs: Arbitrary keyword arguments passed by the lister. - - """ - # reset statuses - self._load_status = 'uneventful' - self._visit_status = 'full' - self.done = False - - self.origin = { - 'url': origin_url, - 'type': self.visit_type, - } - - self.visit_date = None # loader core will populate it - - def prepare(self, name, origin_url, **kwargs): - """Prepare effective loading of source tarballs for a package manager - package. + For now, this is the lister's task output. Args: - name (str): Package Name - origin_url (str): Package origin url - **kwargs: Arbitrary keyword arguments passed by the lister. + package: Package's name (unused) + package_url: Origin url - """ - self.package_contents = [] - self.package_directories = [] - self.package_revisions = [] - self.all_version_data = [] - self.latest_timestamp = 0 - # Conceled the data into one dictionary to eliminate the need of - # passing all the parameters when required in some method - self.package_details = { - 'name': name, - 'origin_url': origin_url, - 'tarballs': kwargs['tarballs'], - } + tarballs: List of dict with keys `date` (date) and `archive` (str) + the url to retrieve one versioned archive - self.package_temp_dir = os.path.join(self.temp_directory, - self.package_details['name']) - - self.new_versions = \ - self.prepare_package_versions(self.package_details['tarballs']) - - def prepare_package_versions(self, tarballs): """ - Instantiate a generator that will process a specific package release - version at each iteration step. The following operations will be - performed: - - 1. Create a temporary directory to download and extract the - release tarball. - 2. Download the tarball. - 3. Uncompress the tarball. - 4. Parse the file associated to the package version to extract - metadata (optional). - 5. Delete unnecessary files (optional). - - Args: - tarballs (list): a list of dicts containing information about the - respective tarball that is provided by lister. - known_versions (dict): may be provided by the loader, it enables - to filter out versions already ingested in the archive. - - Yields: - Tuple[dict, str]: tuples containing the following - members: - - * a dict holding package tarball information and metadata - * a string holding the path of the uncompressed package to - load into the archive - - """ - for package_version_data in tarballs: - - tarball_url = package_version_data['archive'] - tarball_request = self._request(tarball_url, - throw_error=False) - if tarball_request.status_code == 404: - self.log.warning('Tarball url %s returns a 404 error.', - tarball_url) - self._visit_status = 'partial' - # FIX ME: Do we need to mark it `partial` here - continue - - yield self._prepare_package_version(package_version_data, - tarball_request) - - def _request(self, url, throw_error=True): - """Request the remote tarball url. - - Args: - url (str): Url (file or http*). - - Raises: - ValueError in case of failing to query. - - Returns: - Tuple of local (filepath, hashes of filepath). 
- - """ - response = self.session.get(url, **self.params, stream=True) - if response.status_code != 200 and throw_error: - raise ValueError("Fail to query '%s'. Reason: %s" % ( - url, response.status_code)) - - return response - - def _prepare_package_version(self, package_version_data, tarball_request): - """Process the package release version. - - The following operations are performed: - - 1. Download the tarball - 2. Uncompress the tarball - 3. Delete unnecessary files (optional) - 4. Parse the file associated to the package version to extract - metadata (optional) - - Args: - package_version_data (dict): containing information - about the focused package version. - known_versions (dict): may be provided by the loader, it enables - to filter out versions already ingested in the archive. - - Return: - Tuple[dict, str]: tuples containing the following - members: - - * a dict holding package tarball information and metadata - * a string holding the path of the uncompressed package to - load into the archive - - """ - url = package_version_data['archive'] - tarball_path, hashes = self.download_generate_hash(tarball_request, - url) - uncompressed_path = os.path.join(self.package_temp_dir, 'uncompressed', - os.path.basename(url)) # SEE ME - self.uncompress_tarball(tarball_path, uncompressed_path) - - # remove tarball - os.remove(tarball_path) - - if self.tarball_invalid: - return None, None - - return package_version_data, uncompressed_path - - def download_generate_hash(self, response, url): - """Store file in temp directory and computes hash of its filepath. - - Args: - response (Response): Server response of the url - url (str): Url of the tarball - - Returns: - Tuple of local (filepath, hashes of filepath) - - """ - length = int(response.headers['content-length']) - os.makedirs(self.package_temp_dir, exist_ok=True) - # SEE ME - filepath = os.path.join(self.package_temp_dir, os.path.basename(url)) - - # Convert the server response to a file. - h = MultiHash(length=length) - with open(filepath, 'wb') as f: - for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE): - h.update(chunk) - f.write(chunk) - - # Check for the validity of the tarball downloaded. - actual_length = os.path.getsize(filepath) - if length != actual_length: - raise ValueError('Error when checking size: %s != %s' % ( - length, actual_length)) - - hashes = { - 'length': length, - **h.hexdigest() - } - return filepath, hashes - - def uncompress_tarball(self, filepath, path): - """Uncompress a tarball. - - Args: - filepath (str): Path of tarball to uncompress - path (str): The destination folder where to uncompress the tarball - Returns: - The nature of the tarball, zip or tar. - - """ - try: - self.tarball_invalid = False - tarball.uncompress(filepath, path) - except Exception: - self.tarball_invalid = True - self._visit_status = 'partial' - - def fetch_data(self): - """Called once per release artifact version (can be many for one - release). 
-
-        This will for each call:
-        - retrieve a release artifact (associated to a release version)
-        - Computes the swh objects
-
-        Returns:
-            True as long as data to fetch exist
-
-        """
-        data = None
-        if self.done:
-            return False
-
-        try:
-            data = next(self.new_versions)
-            self._load_status = 'eventful'
-        except StopIteration:
-            self.done = True
-            return False
-
-        package_version_data, dir_path = data
-
-        # package release tarball was corrupted
-        if self.tarball_invalid:
-            return not self.done
-
-        dir_path = dir_path.encode('utf-8')
-        directory = Directory.from_disk(path=dir_path, data=True)
-        objects = directory.collect()
-
-        if 'content' not in objects:
-            objects['content'] = {}
-        if 'directory' not in objects:
-            objects['directory'] = {}
-
-        self.package_contents = objects['content'].values()
-        self.package_directories = objects['directory'].values()
-
-        revision = self.build_revision(directory,
-                                       package_version_data)
-
-        revision['id'] = identifier_to_bytes(
-            revision_identifier(revision))
-        self.package_revisions.append(revision)
-        self.log.debug(revision)
-        package_version_data['id'] = revision['id']
-        self.all_version_data.append(package_version_data)
-
-        # To find the latest version
-        if self.latest_timestamp < int(package_version_data['date']):
-            self.latest_timestamp = int(package_version_data['date'])
-
-        self.log.debug('Removing unpacked package files at %s', dir_path)
-        shutil.rmtree(dir_path)
-
-        return not self.done
-
-    def build_revision(self, directory, package_version_data):
-        normalize_date = normalize_timestamp(int(package_version_data['date']))
+        super().__init__(url=package_url)
+        # Sort tarballs by upload date so the last one is the most recent
+        self.tarballs = sorted(tarballs, key=lambda v: int(v['date']))
+
+    def get_versions(self) -> Sequence[str]:
+        for archive in self.tarballs:
+            yield get_version(archive['archive'])
+
+    def get_default_release(self) -> str:
+        # It's the most recent, so for this loader, it's the last one
+        return get_version(self.tarballs[-1]['archive'])
+
+    def get_artifacts(self, version: str) -> Generator[
+            Tuple[str, str, Dict], None, None]:
+        for a_metadata in self.tarballs:
+            url = a_metadata['archive']
+            filename = path.basename(url)
+            yield filename, url, a_metadata
+
+    def fetch_artifact_archive(
+            self, artifact_uri: str, dest: str) -> Tuple[str, Dict]:
+        return download(artifact_uri, dest=dest)
+
+    def build_revision(
+            self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
+
+        normalized_date = normalize_timestamp(int(a_metadata['date']))
         return {
-            'metadata': {
-                'package': {
-                    'date': package_version_data['date'],
-                    'archive': package_version_data['archive'],
-                },
-            },
-            'date': normalize_date,
-            'committer_date': normalize_date,
+            'message': self.REVISION_MESSAGE,
+            'date': normalized_date,
             'author': self.SWH_PERSON,
             'committer': self.SWH_PERSON,
-            'type': 'tar',
-            'message': self.REVISION_MESSAGE,
-            'directory': directory.hash,
-            'synthetic': True,
+            'committer_date': normalized_date,
             'parents': [],
-        }
-
-    def store_data(self):
-        """Store fetched data in the database.
-
-        """
-        self.maybe_load_contents(self.package_contents)
-        self.maybe_load_directories(self.package_directories)
-        self.maybe_load_revisions(self.package_revisions)
-
-        if self.done:
-            self.generate_and_load_snapshot()
-            self.flush()
-
-    def generate_and_load_snapshot(self):
-        """Generate and load snapshot for the package visit.
- - """ - branches = {} - for version_data in self.all_version_data: - branch_name = self.find_branch_name(version_data['archive']) - - target = self.target_from_version(version_data['id']) - branches[branch_name] = target - branches = self.find_head(branches, branch_name, - version_data['date']) - - if not target: - self._visit_status = 'partial' - - snapshot = { - 'branches': branches, - } - - snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot)) - self.maybe_load_snapshot(snapshot) - - def find_branch_name(self, url): - """Extract branch name from tarball url - - Args: - url (str): Tarball URL - - Returns: - byte: Branch name - - Example: - For url = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz - - >>> find_branch_name(url) - b'release/8sync-0.2.0' - - """ - branch_name = '' - filename = os.path.basename(url) - filename_parts = filename.split(".") - if len(filename_parts) > 1 and filename_parts[-2] == 'tar': - for part in filename_parts[:-2]: - branch_name += '.' + part - elif len(filename_parts) > 1 and filename_parts[-1] == 'zip': - for part in filename_parts[:-1]: - branch_name += '.' + part - - return (('release/%s') % branch_name[1:]).encode('ascii') - - def find_head(self, branches, branch_name, timestamp): - """Make branch head. - - Checks if the current version is the latest version. Make it as head - if it is the latest version. - - Args: - branches (dict): Branches for the focused package. - branch_name (str): Branch name - - Returns: - dict: Branches for the focused package - - """ - if self.latest_timestamp == int(timestamp): - branches[b'HEAD'] = { - 'target_type': 'alias', - 'target': branch_name, - } - return branches - - def target_from_version(self, revision_id): - return { - 'target': revision_id, - 'target_type': 'revision', - } if revision_id else None - - def load_status(self): - return { - 'status': self._load_status, + 'metadata': { + 'package': { + 'date': a_metadata['date'], + 'archive': a_metadata['archive'], + }, + }, } - - def visit_status(self): - return self._visit_status - - def cleanup(self): - """Clean up temporary disk use after downloading and extracting - package tarballs. 
- - """ - if self.debug: - self.log.warning('%s Will not clean up temp dir %s' % ( - DEBUG_MODE, self.temp_directory - )) - return - if os.path.exists(self.temp_directory): - self.log.debug('Clean up %s' % self.temp_directory) - shutil.rmtree(self.temp_directory) diff --git a/swh/loader/package/tests/common.py b/swh/loader/package/tests/common.py index 2d54f11..ee334dd 100644 --- a/swh/loader/package/tests/common.py +++ b/swh/loader/package/tests/common.py @@ -1,32 +1,80 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import os -import os.path +from os import path +from urllib.parse import urlparse -RESOURCES_PATH = os.path.join(os.path.dirname(__file__), 'resources') +from swh.model.hashutil import hash_to_bytes, hash_to_hex -package = '8sync' -package_url = 'https://ftp.gnu.org/gnu/8sync/' +DATADIR = path.join(path.abspath(path.dirname(__file__)), 'resources') -tarball = [{'date': '944729610', - 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz'}] +def get_response_cb(request, context): + """Mount point callback to fetch on disk the content of a request -def init_test_data(mock_tarball_request): - """Initialize the loader with the mock of the tarballs + Args: + request (requests.Request): Object requests + context (requests.Context): Object holding requests metadata + information (headers, etc...) + + Returns: + File descriptor on the on disk file to read from the test context """ - for version in tarball: - tarball_url = version['archive'] - tarball_filename = tarball_url.split('/')[-1] - tarball_filepath = os.path.join(RESOURCES_PATH, 'tarballs', - tarball_filename) - with open(tarball_filepath, mode='rb') as tarball_file: - tarball_content = tarball_file.read() - mock_tarball_request.get( - tarball_url, content=tarball_content, - headers={'content-length': str(len(tarball_content))}) + url = urlparse(request.url) + dirname = url.hostname # pypi.org | files.pythonhosted.org + # url.path: pypi//json -> local file: pypi__json + filename = url.path[1:].replace('/', '_') + filepath = path.join(DATADIR, dirname, filename) + fd = open(filepath, 'rb') + context.headers['content-length'] = str(path.getsize(filepath)) + return fd + + +def decode_target(target): + if not target: + return target + target_type = target['target_type'] + + if target_type == 'alias': + decoded_target = target['target'].decode('utf-8') + else: + decoded_target = hash_to_hex(target['target']) + + return { + 'target': decoded_target, + 'target_type': target_type + } + + +def check_snapshot(expected_snapshot, expected_branches, storage): + """Check for snapshot match. + + Provide the hashes as hexadecimal, the conversion is done + within the method. 
+
+
+def decode_target(target):
+    if not target:
+        return target
+    target_type = target['target_type']
+
+    if target_type == 'alias':
+        decoded_target = target['target'].decode('utf-8')
+    else:
+        decoded_target = hash_to_hex(target['target'])
+
+    return {
+        'target': decoded_target,
+        'target_type': target_type
+    }
+
+
+def check_snapshot(expected_snapshot, expected_branches, storage):
+    """Check for snapshot match.
+
+    Provide the hashes as hexadecimal; the conversion is done
+    within the method.
+
+    Args:
+        expected_snapshot (Union[str, dict]): Either the snapshot
+                                              identifier or the full
+                                              snapshot
+        expected_branches ([dict]): expected branches, or nothing if the
+                                    full snapshot is provided
+
+    """
+    if isinstance(expected_snapshot, dict) and not expected_branches:
+        expected_snapshot_id = expected_snapshot['id']
+        expected_branches = expected_snapshot['branches']
+    else:
+        expected_snapshot_id = expected_snapshot
+
+    snap = storage.snapshot_get(hash_to_bytes(expected_snapshot_id))
+    assert snap is not None
+
+    branches = {
+        branch.decode('utf-8'): decode_target(target)
+        for branch, target in snap['branches'].items()
+    }
+    assert expected_branches == branches
diff --git a/swh/loader/package/tests/resources/tarballs/8sync-0.1.0.tar.gz b/swh/loader/package/tests/resources/ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz
similarity index 100%
rename from swh/loader/package/tests/resources/tarballs/8sync-0.1.0.tar.gz
rename to swh/loader/package/tests/resources/ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_gnu.py
index c283075..e65c70a 100644
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_gnu.py
@@ -1,219 +1,167 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import unittest
 import os
-import requests_mock
 
-from swh.loader.package.gnu import GNULoader
-from swh.loader.core.tests import BaseLoaderStorageTest
-from swh.loader.package.tests.common import (
-    package, package_url,
-    tarball, init_test_data
-)
-
-_LOADER_TESTS_CONFIG = {
-    'content_packet_size': 10000,
-    'content_packet_size_bytes': 104857600,
-    'content_size_limit': 104857600,
-    'debug': False,
-    'directory_packet_size': 25000,
-    'occurrence_packet_size': 100000,
-    'release_packet_size': 100000,
-    'revision_packet_size': 100000,
-    'send_contents': True,
-    'send_directories': True,
-    'send_releases': True,
-    'send_revisions': True,
-    'send_snapshot': True,
-    'storage': {'args': {}, 'cls': 'memory'},
-    'temp_directory': '/tmp/swh.loader.gnu/'
+import re
+
+from swh.model.hashutil import hash_to_bytes
+
+from swh.loader.package.gnu import GNULoader, get_version
+from swh.loader.package.tests.common import get_response_cb, check_snapshot
+
+
+def test_get_version():
+    """Branch name computation from the tarball URL should yield
+    something relevant.
+ + """ + for url, expected_branchname in [ + ('https://gnu.org/sthg/info-2.1.0.tar.gz', 'info-2.1.0'), + ('https://gnu.org/sthg/info-2.1.2.zip', 'info-2.1.2'), + ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'), + ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', 'DLDF-1.1.4'), + ('https://sthg.org/gnu/DLDF-1.1.4-1.1.5.tar.gz', + 'DLDF-1.1.4-1.1.5'), + ]: + actual_branchname = get_version(url) + + assert actual_branchname == expected_branchname + + +_expected_new_contents_first_visit = [ + 'e9258d81faf5881a2f96a77ba609396f82cb97ad', + '1170cf105b04b7e2822a0e09d2acf71da7b9a130', + 'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac', + '0057bec9b5422aff9256af240b177ac0e3ac2608', + '2b8d0d0b43a1078fc708930c8ddc2956a86c566e', + '27de3b3bc6545d2a797aeeb4657c0e215a0c2e55', + '2e6db43f5cd764e677f416ff0d0c78c7a82ef19b', + 'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62', + 'edeb33282b2bffa0e608e9d2fd960fd08093c0ea', + 'd64e64d4c73679323f8d4cde2643331ba6c20af9', + '7a756602914be889c0a2d3952c710144b3e64cb0', + '84fb589b554fcb7f32b806951dcf19518d67b08f', + '8624bcdae55baeef00cd11d5dfcfa60f68710a02', + 'e08441aeab02704cfbd435d6445f7c072f8f524e', + 'f67935bc3a83a67259cda4b2d43373bd56703844', + '809788434b433eb2e3cfabd5d591c9a659d5e3d8', + '7d7c6c8c5ebaeff879f61f37083a3854184f6c41', + 'b99fec102eb24bffd53ab61fc30d59e810f116a2', + '7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68', + 'f0c97052e567948adf03e641301e9983c478ccff', + '7fb724242e2b62b85ca64190c31dcae5303e19b3', + '4f9709e64a9134fe8aefb36fd827b84d8b617ab5', + '7350628ccf194c2c3afba4ac588c33e3f3ac778d', + '0bb892d9391aa706dc2c3b1906567df43cbe06a2', + '49d4c0ce1a16601f1e265d446b6c5ea6b512f27c', + '6b5cc594ac466351450f7f64a0b79fdaf4435ad3', + '3046e5d1f70297e2a507b98224b6222c9688d610', + '1572607d456d7f633bc6065a2b3048496d679a31', +] + +_expected_new_directories_first_visit = [ + 'daabc65ec75d487b1335ffc101c0ac11c803f8fc', + '263be23b4a8101d3ad0d9831319a3e0f2b065f36', + '7f6e63ba6eb3e2236f65892cd822041f1a01dd5c', + '4db0a3ecbc976083e2dac01a62f93729698429a3', + 'dfef1c80e1098dd5deda664bb44a9ab1f738af13', + 'eca971d346ea54d95a6e19d5051f900237fafdaa', + '3aebc29ed1fccc4a6f2f2010fb8e57882406b528', +] + +_expected_new_revisions_first_visit = { + '44183488c0774ce3c957fa19ba695cf18a4a42b3': + '3aebc29ed1fccc4a6f2f2010fb8e57882406b528' } +_expected_branches_first_visit = { + 'HEAD': { + 'target_type': 'alias', + 'target': 'releases/8sync-0.1.0', + }, + 'releases/8sync-0.1.0': { + 'target_type': 'revision', + 'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3', + }, +} -class GNULoaderTest(GNULoader): - def parse_config_file(self, *args, **kwargs): - return _LOADER_TESTS_CONFIG - - -@requests_mock.Mocker() -class TestGNULoader(unittest.TestCase, BaseLoaderStorageTest): - - _expected_new_contents_first_visit = [ - 'e9258d81faf5881a2f96a77ba609396f82cb97ad', - '1170cf105b04b7e2822a0e09d2acf71da7b9a130', - 'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac', - '0057bec9b5422aff9256af240b177ac0e3ac2608', - '2b8d0d0b43a1078fc708930c8ddc2956a86c566e', - '27de3b3bc6545d2a797aeeb4657c0e215a0c2e55', - '2e6db43f5cd764e677f416ff0d0c78c7a82ef19b', - 'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62', - 'edeb33282b2bffa0e608e9d2fd960fd08093c0ea', - 'd64e64d4c73679323f8d4cde2643331ba6c20af9', - '7a756602914be889c0a2d3952c710144b3e64cb0', - '84fb589b554fcb7f32b806951dcf19518d67b08f', - '8624bcdae55baeef00cd11d5dfcfa60f68710a02', - 'e08441aeab02704cfbd435d6445f7c072f8f524e', - 'f67935bc3a83a67259cda4b2d43373bd56703844', - '809788434b433eb2e3cfabd5d591c9a659d5e3d8', - '7d7c6c8c5ebaeff879f61f37083a3854184f6c41', - 
-        'b99fec102eb24bffd53ab61fc30d59e810f116a2',
-        '7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68',
-        'f0c97052e567948adf03e641301e9983c478ccff',
-        '7fb724242e2b62b85ca64190c31dcae5303e19b3',
-        '4f9709e64a9134fe8aefb36fd827b84d8b617ab5',
-        '7350628ccf194c2c3afba4ac588c33e3f3ac778d',
-        '0bb892d9391aa706dc2c3b1906567df43cbe06a2',
-        '49d4c0ce1a16601f1e265d446b6c5ea6b512f27c',
-        '6b5cc594ac466351450f7f64a0b79fdaf4435ad3',
-        '3046e5d1f70297e2a507b98224b6222c9688d610',
-        '1572607d456d7f633bc6065a2b3048496d679a31',
-    ]
-
-    _expected_new_directories_first_visit = [
-        'daabc65ec75d487b1335ffc101c0ac11c803f8fc',
-        '263be23b4a8101d3ad0d9831319a3e0f2b065f36',
-        '7f6e63ba6eb3e2236f65892cd822041f1a01dd5c',
-        '4db0a3ecbc976083e2dac01a62f93729698429a3',
-        'dfef1c80e1098dd5deda664bb44a9ab1f738af13',
-        'eca971d346ea54d95a6e19d5051f900237fafdaa',
-        '3aebc29ed1fccc4a6f2f2010fb8e57882406b528',
-    ]
-
-    _expected_new_revisions_first_visit = {
-        '44183488c0774ce3c957fa19ba695cf18a4a42b3':
-        '3aebc29ed1fccc4a6f2f2010fb8e57882406b528'
-    }
-
-    _expected_branches_first_visit = {
-        'HEAD': {
-            'target': 'release/8sync-0.1.0',
-            'target_type': 'alias'
-        },
-        'release/8sync-0.1.0': {
-            'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3',
-            'target_type': 'revision'
-        },
-    }
-
-    _expected_new_snapshot_first_visit = '2ae491bbaeef7351641997d1b9193aa2a67d26bc'  # noqa
-
-    _expected_new_contents_invalid_origin = []
-    _expected_new_directories_invalid_origin = []
-
-    @classmethod
-    def setUpClass(cls):
-        cls.reset_loader()
-
-    @classmethod
-    def reset_loader(cls):
-        cls.loader = GNULoaderTest()
-        cls.storage = cls.loader.storage
-
-    def reset_loader_counters(self):
-        counters_reset = dict.fromkeys(self.loader.counters.keys(), 0)
-        self.loader.counters.update(counters_reset)
-
-    def test_gnu_loader_first_visit_success(self, mock_tarball_request):
-        """In this scenario no visit as taken place prior to this visit.
- - """ - self.reset_loader() - init_test_data(mock_tarball_request) - self.loader.load(package, package_url, tarballs=tarball) - - self.assertCountContents(len(self._expected_new_contents_first_visit)) - self.assertContentsContain(self._expected_new_contents_first_visit) - self.assertEqual(self.loader.counters['contents'], - len(self._expected_new_contents_first_visit)) - - self.assertCountDirectories(len(self._expected_new_directories_first_visit)) # noqa - self.assertDirectoriesContain(self._expected_new_directories_first_visit) # noqa - self.assertEqual(self.loader.counters['directories'], - len(self._expected_new_directories_first_visit)) - - self.assertCountRevisions(1, '1 artifact releases so 1 revisions should be created') # noqa - self.assertRevisionsContain(self._expected_new_revisions_first_visit) - self.assertEqual(self.loader.counters['revisions'], - len(self._expected_new_revisions_first_visit)) - - self.assertCountReleases(0, 'No release is created by the loader') - self.assertEqual(self.loader.counters['releases'], 0) - - self.assertCountSnapshots(1, 'Only 1 snapshot targeting all revisions') - self.assertSnapshotEqual(self._expected_new_snapshot_first_visit, - self._expected_branches_first_visit) - - self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) - self.assertEqual(self.loader.visit_status(), 'full') - - self.assertFalse(os.path.exists(self.loader.temp_directory)) - - def test_gnu_loader_origin_invalid(self, mock_tarball_request): - """In this scenario, tarball link is not valid and will give 404 error - - """ - self.reset_loader() - mock_tarball_request.get( - 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz', - text='Not Found', status_code=404) - self.loader.load(package, package_url, tarballs=tarball) - - self.assertContentsContain(self._expected_new_contents_invalid_origin) - self.assertCountContents(len(self._expected_new_contents_invalid_origin)) # noqa - self.assertEqual(self.loader.counters['contents'], - len(self._expected_new_contents_invalid_origin)) - - self.assertDirectoriesContain(self._expected_new_directories_invalid_origin) # noqa - self.assertCountDirectories(len(self._expected_new_directories_invalid_origin)) # noqa - self.assertEqual(self.loader.counters['directories'], - len(self._expected_new_directories_invalid_origin)) - - self.assertCountRevisions(0, '0 releases so 0 revisions should be created') # noqa - - self.assertEqual(self.loader.counters['releases'], 0) - self.assertCountReleases(0, 'No release is created by the loader') - - self.assertCountSnapshots(1, 'Only 1 snapshot targeting all revisions') - - self.assertEqual(self.loader.load_status(), {'status': 'uneventful'}) - self.assertEqual(self.loader.visit_status(), 'partial') - - self.assertFalse(os.path.exists(self.loader.temp_directory)) - - def test_gnu_loader_second_visit(self, mock_tarball_request): - """This scenario makes use of the incremental nature of the loader. - - In this test there is no change from the first visit. So same result - as first visit. 
- """ - self.reset_loader() - init_test_data(mock_tarball_request) - self.loader.load(package, package_url, tarballs=tarball) - - self.assertCountContents(len(self._expected_new_contents_first_visit)) - self.assertContentsContain(self._expected_new_contents_first_visit) - self.assertEqual(self.loader.counters['contents'], - len(self._expected_new_contents_first_visit)) - - self.assertCountDirectories(len(self._expected_new_directories_first_visit)) # noqa - self.assertDirectoriesContain(self._expected_new_directories_first_visit) # noqa - self.assertEqual(self.loader.counters['directories'], - len(self._expected_new_directories_first_visit)) - - self.assertCountRevisions(1, '1 artifact releases so 1 revisions should be created') # noqa - self.assertRevisionsContain(self._expected_new_revisions_first_visit) - self.assertEqual(self.loader.counters['revisions'], - len(self._expected_new_revisions_first_visit)) - - self.assertCountReleases(0, 'No release is created by the loader') - self.assertEqual(self.loader.counters['releases'], 0) - - self.assertCountSnapshots(1, 'Only 1 snapshot targeting all revisions') - self.assertSnapshotEqual(self._expected_new_snapshot_first_visit, - self._expected_branches_first_visit) - - self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) - self.assertEqual(self.loader.visit_status(), 'full') - - self.assertFalse(os.path.exists(self.loader.temp_directory)) +# hash is different then before as we changed the snapshot +# gnu used to use `release/` (singular) instead of plural +_expected_new_snapshot_first_visit_id = '69f368defd75ddd3972d2a687f4cc565c7aa58d9' # noqa + + +# def test_release_artifact_not_found(requests_mock): +# package = '8sync' +# package_url = 'https://ftp.gnu.org/gnu/8sync/' +# tarballs = [{ +# 'date': '944729610', +# 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz', +# }] + +# loader = GNULoader(package, package_url, tarballs) +# requests_mock.get(re.compile('https://'), status_code=404) + +# assert actual_load_status == {'status': 'uneventful'} +# stats = loader.storage.stat_counters() + +# assert { +# 'content': 0, +# 'directory': 0, +# 'origin': 1, +# 'origin_visit': 1, +# 'person': 0, +# 'release': 0, +# 'revision': 0, +# 'skipped_content': 0, +# 'snapshot': 0, +# } == stats + + +def test_release_artifact_no_prior_visit(requests_mock): + """With no prior visit, load a pypi project ends up with 1 snapshot + + """ + assert 'SWH_CONFIG_FILENAME' in os.environ # cf. 
diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py
index 7dd1c67..a3f0e87 100644
--- a/swh/loader/package/tests/test_pypi.py
+++ b/swh/loader/package/tests/test_pypi.py
@@ -1,371 +1,313 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 import re
 
 from os import path
-from urllib.parse import urlparse
 
 import pytest
 
 from swh.core.tarball import uncompress
-from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.hashutil import hash_to_bytes
 
 from swh.loader.package.pypi import (
     PyPILoader, pypi_api_url, pypi_info, author, sdist_parse
 )
 
-DATADIR = path.join(path.abspath(path.dirname(__file__)), 'resources')
+from swh.loader.package.tests.common import (
+    get_response_cb, DATADIR, check_snapshot
+)
 
 
 def test_author_basic():
     data = {
         'author': "i-am-groot",
         'author_email': 'iam@groot.org',
     }
     actual_author = author(data)
 
     expected_author = {
         'fullname': b'i-am-groot <iam@groot.org>',
         'name': b'i-am-groot',
         'email': b'iam@groot.org',
     }
 
     assert actual_author == expected_author
 
 
 def test_author_empty_email():
     data = {
         'author': 'i-am-groot',
         'author_email': '',
     }
     actual_author = author(data)
 
     expected_author = {
         'fullname': b'i-am-groot',
         'name': b'i-am-groot',
         'email': b'',
     }
 
     assert actual_author == expected_author
 
 
 def test_author_empty_name():
     data = {
         'author': "",
         'author_email': 'iam@groot.org',
     }
     actual_author = author(data)
 
     expected_author = {
         'fullname': b' <iam@groot.org>',
         'name': b'',
         'email': b'iam@groot.org',
     }
 
     assert actual_author == expected_author
 
 
 def test_author_malformed():
     data = {
         'author': "['pierre', 'paul', 'jacques']",
         'author_email': None,
     }
 
     actual_author = author(data)
 
     expected_author = {
         'fullname': b"['pierre', 'paul', 'jacques']",
         'name': b"['pierre', 'paul', 'jacques']",
         'email': None,
     }
 
     assert actual_author == expected_author
 
 
 def test_author_malformed_2():
     data = {
         'author': '[marie, jeanne]',
         'author_email': '[marie@some, jeanne@thing]',
     }
 
     actual_author = author(data)
 
     expected_author = {
         'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
         'name': b'[marie, jeanne]',
         'email': b'[marie@some, jeanne@thing]',
     }
 
     assert actual_author == expected_author
 
 
 def test_author_malformed_3():
     data = {
         'author': '[marie, jeanne, pierre]',
         'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
     }
 
     actual_author = author(data)
 
     expected_author = {
         'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>',  # noqa
         'name': b'[marie, jeanne, pierre]',
         'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
     }
 
     assert actual_author == expected_author
 
 
 # configuration error #
 
 def test_badly_configured_loader_raise(monkeypatch):
     """Badly configured loader should raise"""
     monkeypatch.delenv('SWH_CONFIG_FILENAME')
     with pytest.raises(ValueError) as e:
         PyPILoader(url='some-url')
 
     assert 'Misconfiguration' in e.value.args[0]
 
 
 def test_pypi_api_url():
     """Compute pypi api url from the pypi project url should be ok"""
     url = pypi_api_url('https://pypi.org/project/requests')
     assert url == 'https://pypi.org/pypi/requests/json'
 
 
 def test_pypi_info_failure(requests_mock):
     """Failure to fetch info/release information should raise"""
     project_url = 'https://pypi.org/project/requests'
     info_url = 'https://pypi.org/pypi/requests/json'
     status_code = 400
     requests_mock.get(info_url, status_code=status_code)
 
     with pytest.raises(ValueError) as e0:
         pypi_info(project_url)
 
     assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (
         info_url, status_code
     )
 
 
 def test_pypi_info(requests_mock):
     """Fetching json info from pypi project should be ok"""
     url = 'https://pypi.org/project/requests'
     info_url = 'https://pypi.org/pypi/requests/json'
     requests_mock.get(info_url, text='{"version": "0.0.1"}')
     actual_info = pypi_info(url)
     assert actual_info == {
         'version': '0.0.1',
     }
 
 
 @pytest.mark.fs
 def test_sdist_parse(tmp_path):
     """Parsing existing archive's PKG-INFO should yield results"""
     uncompressed_archive_path = str(tmp_path)
     archive_path = path.join(
         DATADIR, 'files.pythonhosted.org', '0805nexter-1.1.0.zip')
     uncompress(archive_path, dest=uncompressed_archive_path)
 
     actual_sdist = sdist_parse(uncompressed_archive_path)
     expected_sdist = {
         'metadata_version': '1.0',
         'name': '0805nexter',
         'version': '1.1.0',
         'summary': 'a simple printer of nested lest',
         'home_page': 'http://www.hp.com',
         'author': 'hgtkpython',
         'author_email': '2868989685@qq.com',
         'platforms': ['UNKNOWN'],
     }
 
     assert actual_sdist == expected_sdist
 
 
 @pytest.mark.fs
 def test_sdist_parse_failures(tmp_path):
     """Parsing a nonexistent path/archive/PKG-INFO yields None"""
     # nonexistent first-level path
     assert sdist_parse('/something-inexistant') is None
     # nonexistent second-level path (as expected by pypi archives)
     assert sdist_parse(tmp_path) is None
     # nonexistent PKG-INFO within the second-level path
     existing_path_no_pkginfo = str(tmp_path / 'something')
     os.mkdir(existing_path_no_pkginfo)
     assert sdist_parse(tmp_path) is None
 
 
 # LOADER SCENARIO #
-
-def get_response_cb(request, context):
-    """"""
-    url = urlparse(request.url)
-    dirname = url.hostname  # pypi.org | files.pythonhosted.org
-    # url.path: pypi/<project>/json -> local file: pypi_<project>_json
-    filename = url.path[1:].replace('/', '_')
-    filepath = path.join(DATADIR, dirname, filename)
-    fd = open(filepath, 'rb')
-    context.headers['content-length'] = str(os.path.getsize(filepath))
-    return fd
 
 
 # "edge" cases (for the same origin) #
 
 
 def test_no_release_artifact(requests_mock):
     pass
 
 # no release artifact:
 # {visit full, status: uneventful, no contents, etc...}
 
 # problem during loading:
 # {visit: partial, status: uneventful, no snapshot}
 
 # problem during loading: failure early enough in between swh contents...
 # some contents (contents, directories, etc...) have been written in storage
 # {visit: partial, status: eventful, no snapshot}
 
 # problem during loading: failure late enough we can have snapshots (some
 # revisions are written in storage already)
 # {visit: partial, status: eventful, snapshot}
 
 
 # "normal" cases (for the same origin) #
 
 
 def test_release_artifact_no_prior_visit(requests_mock):
     """With no prior visit, load a pypi project ends up with 1 snapshot
 
     """
     assert 'SWH_CONFIG_FILENAME' in os.environ  # cf. tox.ini
     loader = PyPILoader('https://pypi.org/project/0805nexter')
-    requests_mock.get(re.compile('https://'),
-                      body=get_response_cb)
+    requests_mock.get(re.compile('https://'), body=get_response_cb)
 
     actual_load_status = loader.load()
 
     assert actual_load_status == {'status': 'eventful'}
 
     stats = loader.storage.stat_counters()
 
     assert {
         'content': 6,
         'directory': 4,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 2,
         'skipped_content': 0,
         'snapshot': 1
     } == stats
 
     expected_contents = map(hash_to_bytes, [
         'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
         '938c33483285fd8ad57f15497f538320df82aeb8',
         'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
         '405859113963cb7a797642b45f171d6360425d16',
         'e5686aa568fdb1d19d7f1329267082fe40482d31',
         '83ecf6ec1114fd260ca7a833a2d165e71258c338',
     ])
 
     assert list(loader.storage.content_missing_per_sha1(expected_contents))\
         == []
 
     expected_dirs = map(hash_to_bytes, [
         '05219ba38bc542d4345d5638af1ed56c7d43ca7d',
         'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
         'b178b66bd22383d5f16f4f5c923d39ca798861b4',
         'c3a58f8b57433a4b56caaa5033ae2e0931405338',
     ])
 
     assert list(loader.storage.directory_missing(expected_dirs)) == []
 
     # {revision hash: directory hash}
     expected_revs = {
         hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'),  # noqa
         hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'),  # noqa
     }
 
     assert list(loader.storage.revision_missing(expected_revs)) == []
 
     expected_branches = {
         'releases/1.1.0': {
             'target': '4c99891f93b81450385777235a37b5e966dd1571',
             'target_type': 'revision',
         },
         'releases/1.2.0': {
             'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
             'target_type': 'revision',
         },
         'HEAD': {
             'target': 'releases/1.2.0',
             'target_type': 'alias',
         },
     }
 
     check_snapshot(
         'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a',
         expected_branches,
         storage=loader.storage)
 
 
 # self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
 # self.assertEqual(self.loader.visit_status(), 'full')
 
 
 # release artifact, no new artifact
 # {visit full, status uneventful, same snapshot as before}
 
 # release artifact, new artifact
 # {visit full, status full, new snapshot with shared history as prior snapshot}
 
 # release artifact, old artifact with different checksums
 # {visit full, status full, new snapshot with shared history and some new
 # different history}
-
-
-def decode_target(target):
-    if not target:
-        return target
-    target_type = target['target_type']
-
-    if target_type == 'alias':
-        decoded_target = target['target'].decode('utf-8')
-    else:
-        decoded_target = hash_to_hex(target['target'])
-
-    return {
-        'target': decoded_target,
-        'target_type': target_type
-    }
-
-
-def check_snapshot(expected_snapshot, expected_branches, storage):
-    """Check for snapshot match.
-
-    Provide the hashes as hexadecimal, the conversion is done
-    within the method.
-
-    Args:
-        expected_snapshot (Union[str, dict]): Either the snapshot
-                                              identifier or the full
-                                              snapshot
-        expected_branches ([dict]): expected branches or nothing is
-                                    the full snapshot is provided
-
-    """
-    if isinstance(expected_snapshot, dict) and not expected_branches:
-        expected_snapshot_id = expected_snapshot['id']
-        expected_branches = expected_snapshot['branches']
-    else:
-        expected_snapshot_id = expected_snapshot
-
-    snap = storage.snapshot_get(hash_to_bytes(expected_snapshot_id))
-    assert snap is not None
-
-    branches = {
-        branch.decode('utf-8'): decode_target(target)
-        for branch, target in snap['branches'].items()
-    }
-    assert expected_branches == branches
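The trailing scenario comments in test_pypi.py describe tests still to be written. As an illustration of the first one ("release artifact, no new artifact"), here is a possible shape reusing this patch's helpers; the test name and the uneventful-status expectation come from the comment itself, not from implemented code:

# Sketch: a second visit of an unchanged origin is expected to be
# uneventful and to produce the same snapshot as the first visit.
def test_release_artifact_no_new_artifact(requests_mock):
    requests_mock.get(re.compile('https://'), body=get_response_cb)
    loader = PyPILoader('https://pypi.org/project/0805nexter')

    assert loader.load() == {'status': 'eventful'}    # first visit
    assert loader.load() == {'status': 'uneventful'}  # unchanged second visit

    # expected_branches: same dict as in test_release_artifact_no_prior_visit
    check_snapshot('ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a',
                   expected_branches, storage=loader.storage)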