diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
 swh-loader-pypi
 ====================
 
-SWH PyPi loader's source code repository
+SWH PyPI loader's source code repository
diff --git a/debian/changelog b/debian/changelog
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-swh-loader-pypy (0.0.1-1) unstable; urgency=low
+swh-loader-pypi (0.0.1-1) unstable; urgency=low
 
   * Initial bootstrap
diff --git a/debian/control b/debian/control
--- a/debian/control
+++ b/debian/control
@@ -5,7 +5,10 @@ Build-Depends: debhelper (>= 9),
                dh-python (>= 2),
                python3-all,
+               python3-arrow,
                python3-nose,
+               python3-pkginfo,
+               python3-requests,
                python3-setuptools,
                python3-swh.core,
                python3-swh.storage,
@@ -21,4 +24,4 @@
          python3-swh.loader.core,
          python3-swh.storage,
          ${misc:Depends},
          ${python3:Depends}
-Description: Software Heritage PyPi Loader
+Description: Software Heritage PyPI Loader
diff --git a/debian/rules b/debian/rules
--- a/debian/rules
+++ b/debian/rules
@@ -9,3 +9,4 @@ override_dh_install:
 	dh_install
 	rm -v $(CURDIR)/debian/python3-*/usr/lib/python*/dist-packages/swh/__init__.py
+	rm -v $(CURDIR)/debian/python3-*/usr/lib/python*/dist-packages/swh/loader/__init__.py
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 setuptools
 vcversioner
+requests
+arrow
+pkginfo
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@
 setup(
     name='swh.loader.pypi',
-    description='Software Heritage PyPi Loader',
+    description='Software Heritage PyPI Loader',
     author='Software Heritage developers',
     author_email='swh-devel@inria.fr',
     url='https://forge.softwareheritage.org/source/swh-loader-pypi.git',
@@ -35,6 +35,6 @@
     install_requires=parse_requirements() + parse_requirements('swh'),
     test_requires=parse_requirements('test'),
     setup_requires=['vcversioner'],
-    vcversioner={},
+    vcversioner={'version_module_paths': ['swh/loader/pypi/_version.py']},
     include_package_data=True,
 )
diff --git a/swh/loader/__init__.py b/swh/loader/__init__.py
--- a/swh/loader/__init__.py
+++ b/swh/loader/__init__.py
@@ -0,0 +1 @@
+__path__ = __import__('pkgutil').extend_path(__path__, __name__)
diff --git a/swh/loader/pypi/.gitignore b/swh/loader/pypi/.gitignore
new file mode 100644
--- /dev/null
+++ b/swh/loader/pypi/.gitignore
@@ -0,0 +1 @@
+_version.py
diff --git a/swh/loader/pypi/client.py b/swh/loader/pypi/client.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/pypi/client.py
@@ -0,0 +1,468 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import arrow
+import hashlib
+import logging
+import os
+import requests
+import shutil
+
+from .converters import info, author
+
+from pkginfo import UnpackedSDist
+
+from swh.core import tarball
+from swh.model import hashutil
+
+try:
+    from swh.loader.pypi._version import __version__
+except ImportError:
+    __version__ = 'devel'
+
+
+def convert_to_hex(d):
+    """Convert a flat dictionary with bytes in values to the same dictionary
+    with hex as values.
+
+    Args:
+        d (dict): flat dictionary with sha bytes in its values.
+
+    Returns:
+        Mirror dictionary with values as string hex.
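+
+    Example (illustrative values):
+
+        convert_to_hex({'sha1': bytes.fromhex('34973274ccef'), 'length': 6})
+        # -> {'sha1': '34973274ccef', 'length': 6}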
+
+    """
+    if not d:
+        return d
+
+    checksums = {}
+    for key, h in d.items():
+        if isinstance(h, bytes):
+            checksums[key] = hashutil.hash_to_hex(h)
+        else:
+            checksums[key] = h
+
+    return checksums
+
+
+def _to_dict(pkginfo):
+    """Given a pkginfo parsed structure, convert it to a dict.
+
+    Args:
+        pkginfo (UnpackedSDist): The sdist parsed structure
+
+    Returns:
+        parsed structure as a dict
+
+    """
+    m = {}
+    for k in pkginfo:
+        m[k] = getattr(pkginfo, k)
+    return m
+
+
+def _project_pkginfo(dir_path):
+    """Given an uncompressed path holding the pkginfo file, returns a
+    pkginfo parsed structure as a dict.
+
+    A release artifact contains one folder at its root. For example:
+    $ tar tvf zprint-0.0.6.tar.gz
+    drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
+    ...
+
+    Args:
+        dir_path (str): Path to the uncompressed directory
+                        representing a release artifact from pypi.
+
+    Returns:
+        the pkginfo parsed structure as a dict if any or None if
+        none was present.
+
+    """
+    # Retrieve the root folder of the archive
+    project_dirname = os.listdir(dir_path)[0]
+    pkginfo_path = os.path.join(dir_path, project_dirname, 'PKG-INFO')
+    if not os.path.exists(pkginfo_path):
+        return None
+    pkginfo = UnpackedSDist(pkginfo_path)
+    return _to_dict(pkginfo)
+
+
+class PyPIClient:
+    """PyPI client in charge of communicating with the PyPI server.
+
+    Args:
+        base_url (str): PyPI instance's base url
+        temp_directory (str): Path to the temporary disk location used
+                              for uncompressing the release artifacts
+        cache (bool): Use an internal cache to keep the archives on
+                      disk. Default is not to use it.
+        cache_dir (str): cache's disk location (relevant only when
+                         `cache` is True)
+
+    The last two parameters are not intended for production use.
+
+    """
+    def __init__(self, base_url='https://pypi.org/pypi',
+                 temp_directory=None, cache=False, cache_dir=None):
+        self.version = __version__
+        self.base_url = base_url
+        self.temp_directory = temp_directory
+
+        self.do_cache = cache
+        if self.do_cache:
+            self.cache_dir = cache_dir
+            self.cache_raw_dir = os.path.join(cache_dir, 'archives')
+            os.makedirs(self.cache_raw_dir, exist_ok=True)
+        self.session = requests.session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage PyPI Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def _save_response(self, response, project=None):
+        """In cache mode, write the server's JSON response to the cache dir.
+
+        Args:
+            response (Response): full server response
+            project (str): project name, used to build the cache file's
+                           name (optional)
+
+        Returns:
+            nothing
+
+        """
+        import gzip
+        from json import dumps
+        datepath = arrow.utcnow().isoformat()
+        name = '%s.gz' % datepath if project is None else '%s-%s.gz' % (
+            project, datepath)
+        fname = os.path.join(self.cache_dir, name)
+        with gzip.open(fname, 'w') as f:
+            f.write(bytes(
+                dumps(response.json()),
+                'utf-8'
+            ))
+
+    def _save_raw(self, filepath):
+        """In cache mode, back up the filepath to self.cache_raw_dir
+
+        Args:
+            filepath (str): Path of the file to save
+
+        """
+        _filename = os.path.basename(filepath)
+        _archive = os.path.join(self.cache_raw_dir, _filename)
+        shutil.copyfile(filepath, _archive)
+
+    def _get_raw(self, filepath):
+        """In cache mode, we try to retrieve the cached file.
+
+        """
+        _filename = os.path.basename(filepath)
+        _archive = os.path.join(self.cache_raw_dir, _filename)
+        if not os.path.exists(_archive):
+            return None
+        shutil.copyfile(_archive, filepath)
+        return filepath
+
+    def _get(self, url, project=None):
+        """Issue a GET query to the url.
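+
+        The url is typically a PyPI metadata endpoint, e.g.
+        'https://pypi.org/pypi/<project>/json' for a project's main
+        information.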
+
+        Args:
+            url (str): Url
+
+        Raises:
+            ValueError when the query fails
+
+        Returns:
+            The JSON response as a dict when the query succeeds
+
+        """
+        response = self.session.get(url, **self.params)
+        if response.status_code != 200:
+            raise ValueError("Failed to query '%s'. Reason: %s" % (
+                url, response.status_code))
+
+        if self.do_cache:
+            self._save_response(response, project=project)
+
+        return response.json()
+
+    def info(self, project_url, project=None):
+        """Given a project metadata url, retrieve the raw json response
+
+        Args:
+            project_url (str): Url of the project's PyPI metadata
+
+        Returns:
+            Main project information as dict.
+
+        """
+        return self._get(project_url, project=project)
+
+    def release(self, project, release):
+        """Given a project and a release name, retrieve the raw information
+        for said project's release.
+
+        Args:
+            project (str): Project's name
+            release (str): Release version (name)
+
+        Returns:
+            Release information as dict
+
+        """
+        release_url = '%s/%s/%s/json' % (self.base_url, project, release)
+        return self._get(release_url, project=project)
+
+    def prepare_release_artifacts(self, project, version, release_artifacts):
+        """For a given project's release version, fetch and prepare the
+        associated release artifacts.
+
+        Args:
+            project (str): PyPI Project
+            version (str): Release version
+            release_artifacts ([dict]): List of source distribution
+                                        release artifacts
+
+        Yields:
+            tuple (release, artifact, filepath, uncompressed_archive_path,
+            pkginfo) where:
+
+            - release (dict): release information
+            - artifact (dict): release artifact's associated info
+            - filepath (str): Local artifact's path
+            - uncompressed_archive_path (str): uncompressed archive path
+            - pkginfo (dict): package information or None if none found
+
+        """
+        for artifact in release_artifacts:
+            release = {
+                'name': version,
+                'message': artifact.get('comment_text', ''),
+            }
+            artifact = {
+                'sha256': artifact['digests']['sha256'],
+                'size': artifact['size'],
+                'filename': artifact['filename'],
+                'url': artifact['url'],
+                'date': artifact['upload_time'],
+            }
+            yield self.prepare_release_artifact(project, release, artifact)
+
+    def prepare_release_artifact(self, project, release, artifact):
+        """For a given project release, fetch and prepare the associated
+        artifact.
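+
+        The artifact dict is expected to carry at least the 'url',
+        'filename', 'size' and 'sha256' keys, as built by
+        prepare_release_artifacts above.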
+
+        This:
+        - fetches the artifact
+        - checks that the size and hashes match
+        - uncompresses the artifact locally
+        - computes the swh hashes
+        - returns the associated information for the artifact
+
+        Args:
+            project (str): Project's name
+            release (dict): Release information
+            artifact (dict): Release artifact information
+
+        Returns:
+            tuple (release, artifact, filepath, uncompressed_archive_path,
+            pkginfo) where:
+
+            - release (dict): Release information (name, message)
+            - artifact (dict): release artifact's information
+            - filepath (str): Local artifact's path
+            - uncompressed_archive_path (str): uncompressed archive path
+            - pkginfo (dict): package information or None if none found
+
+        """
+        version = release['name']
+        logging.debug('Release version: %s' % version)
+        path = os.path.join(self.temp_directory, project, version)
+        os.makedirs(path, exist_ok=True)
+        filepath = os.path.join(path, artifact['filename'])
+        logging.debug('Artifact local path: %s' % filepath)
+
+        _filepath = None
+        if self.do_cache:
+            _filepath = self._get_raw(filepath)
+
+        if not _filepath:  # no cache hit, we fetch from pypi
+            url = artifact['url']
+            r = self.session.get(url, **self.params)
+            status = r.status_code
+            if status != 200:
+                if status == 404:
+                    raise ValueError("Project '%s' not found" % url)
+                else:
+                    msg = "Failed to query '%s'\nCode: %s\nDetails: %s" % (
+                        url, r.status_code, r.content)
+                    raise ValueError(msg)
+
+            _len = len(r.content)
+            if _len != artifact['size']:
+                raise ValueError('Size mismatch: expected %s, got %s' % (
+                    artifact['size'], _len))
+
+            # check the sha256 digest while writing the archive to disk
+            h = hashlib.sha256()
+            with open(filepath, 'wb') as f:
+                for chunk in r.iter_content():
+                    h.update(chunk)
+                    f.write(chunk)
+
+            actual_digest = h.hexdigest()
+            if actual_digest != artifact['sha256']:
+                raise ValueError(
+                    '%s %s: Checksum mismatch: %s != %s' % (
+                        project, version, artifact['sha256'], actual_digest))
+
+            if self.do_cache:
+                self._save_raw(filepath)
+
+        uncompress_path = os.path.join(path, 'uncompress')
+        os.makedirs(uncompress_path, exist_ok=True)
+
+        nature = tarball.uncompress(filepath, uncompress_path)
+
+        hashes = hashutil.hash_path(filepath)
+        hashes.pop('length')  # 'size' entry is already referenced
+        artifact_hashes = convert_to_hex(hashes)
+        artifact['archive_type'] = nature
+        artifact.update(artifact_hashes)
+        pkginfo = _project_pkginfo(uncompress_path)
+        return release, artifact, filepath, uncompress_path, pkginfo
+
+
+class PyPIProject:
+    """PyPI project representation
+
+    This allows extracting information for:
+    - the project, either the latest information (from its last
+      release), or the information for a given release (artifact)
+    - symmetrically, the release artifact's author information
+
+    This also fetches and uncompresses the associated release
+    artifacts.
+
+    """
+    def __init__(self, client, project, project_metadata_url, data=None):
+        self.client = client
+        self.project = project
+        self.project_metadata_url = project_metadata_url
+        if data:
+            self.data = data
+        else:
+            self.data = client.info(project_metadata_url, project)
+
+        self.last_version = self.data['info']['version']
+        self.cache = {
+            self.last_version: self.data
+        }
+
+    def _data(self, release_name=None):
+        """Fetch data per release and cache it. Return the cached data if
+        already fetched.
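+
+        For example (illustrative), _data('1.2.0') queries
+        '<base_url>/<project>/1.2.0/json' at most once, then serves
+        subsequent calls from self.cache.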
+
+        """
+        if release_name:
+            data = self.cache.get(release_name)
+            if not data:
+                data = self.client.release(self.project, release_name)
+                self.cache[release_name] = data
+        else:
+            data = self.data
+        return data
+
+    def info(self, release_name=None):
+        """Compute release information for the provided release (or the
+        latest one).
+
+        """
+        return info(self._data(release_name))
+
+    def _filter_release_artifacts(self, version, releases, known_artifacts):
+        """Filter out already known release artifacts, keeping only source
+        distributions (sdist).
+
+        There can be multiple 'packagetype' values (sdist, bdist_egg,
+        bdist_wheel, bdist_rpm, bdist_msi, bdist_wininst, ...); we are
+        only interested in source distributions (sdist), the other
+        bdist* types being binary.
+
+        Args:
+            version (str): Release name or version
+            releases (dict or [dict]): Full release object (or a list
+                                       of them)
+            known_artifacts ([tuple]): List of known artifacts as
+                                       (filename, sha256) tuples
+
+        Yields:
+            release artifacts not already known
+
+        """
+        if not releases:
+            return []
+        if not isinstance(releases, list):
+            releases = [releases]
+        for artifact in releases:
+            name = artifact['filename']
+            sha256 = artifact['digests']['sha256']
+            if (name, sha256) in known_artifacts:
+                logging.debug('artifact (%s, %s) already seen for release %s, skipping' % (  # noqa
+                    name, sha256, version))
+                continue
+            if artifact['packagetype'] != 'sdist':
+                continue
+            yield artifact
+
+    def _cleanup_release_artifacts(self, archive_path, directory_path):
+        """Clean up intermediary files which no longer need to be present.
+
+        """
+        if directory_path and os.path.exists(directory_path):
+            logging.debug('Clean up uncompressed archive path %s' % (
+                directory_path, ))
+            shutil.rmtree(directory_path)
+
+        if archive_path and os.path.exists(archive_path):
+            logging.debug('Clean up archive %s' % archive_path)
+            os.unlink(archive_path)
+
+    def releases(self, known_artifacts):
+        """Fetch metadata/data per release (if a new release artifact is
+        detected)
+
+        For each new release artifact, this:
+        - downloads and uncompresses the release artifact
+ - yields the (release info, author info, release, dir_path) + - Clean up the intermediary fetched artifact files + + Args: + known_artifacts (tuple): artifact name, artifact sha256 hash + + Yields: + tuple (version, release_info, release, uncompressed_path) where: + + - project_info (dict): release's associated version info + - author (dict): Author information for the release + - artifact (dict): Release artifact information + - release (dict): release metadata + - uncompressed_path (str): Path to uncompressed artifact + + """ + releases_dict = self.data['releases'] + for version, releases in releases_dict.items(): + releases = self._filter_release_artifacts( + version, releases, known_artifacts) + releases = self.client.prepare_release_artifacts( + self.project, version, releases) + for release, artifact, archive, dir_path, pkginfo in releases: + if pkginfo is None: # fallback to pypi api metadata + msg = '%s %s: No PKG-INFO detected, skipping' % ( # noqa + self.project, version) + logging.warn(msg) + continue + yield pkginfo, author(pkginfo), release, artifact, dir_path + self._cleanup_release_artifacts(archive, dir_path) diff --git a/swh/loader/pypi/converters.py b/swh/loader/pypi/converters.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/converters.py @@ -0,0 +1,70 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def info(data): + """Given a dict of a PyPI project information, returns a project + subset. + + Args: + data (dict): Representing either artifact information or + release information. + + Returns: + A dict subset of project information. + + """ + _info = data['info'] + default = { + 'home_page': _info['home_page'], + 'description': _info['description'], + 'summary': _info['summary'], + 'license': _info['license'], + 'package_url': _info['package_url'], + 'project_url': _info['project_url'], + 'upstream': None, + } + + project_urls = _info.get('project_urls') + if project_urls: + homepage = project_urls.get('Homepage') + if homepage: + default['upstream'] = homepage + + return default + + +def author(data): + """Given a dict of project/release artifact information (coming from + PyPI), returns an author subset. + + Args: + data (dict): Representing either artifact information or + release information. + + Returns: + swh-model dict representing a person. 
+ + """ + name = data['author'] + email = data['author_email'] + if email: + fullname = '%s <%s>' % (name, email) + else: + fullname = name + + if not fullname: + return {'fullname': b'', 'name': None, 'email': None} + + if fullname: + fullname = fullname.encode('utf-8') + + if name: + name = name.encode('utf-8') + + if email: + email = email.encode('utf-8') + + return {'fullname': fullname, 'name': name, 'email': email} diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/loader.py @@ -0,0 +1,209 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import arrow +import os +import shutil + +from tempfile import mkdtemp + +from swh.loader.core.utils import clean_dangling_folders +from swh.loader.core.loader import SWHLoader +from swh.model.from_disk import Directory +from swh.model.identifiers import ( + revision_identifier, snapshot_identifier, + identifier_to_bytes, normalize_timestamp +) + +from .client import PyPIClient, PyPIProject + + +TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.' +DEBUG_MODE = '** DEBUG MODE **' + + +class PyPILoader(SWHLoader): + CONFIG_BASE_FILENAME = 'loader/pypi' + ADDITIONAL_CONFIG = { + 'temp_directory': ('str', '/tmp/swh.loader.pypi/'), + 'cache': ('bool', False), + 'cache_dir': ('str', ''), + 'debug': ('bool', False), # NOT FOR PRODUCTION + } + + def __init__(self, client=None): + super().__init__(logging_class='swh.loader.pypi.PyPILoader') + self.origin_id = None + if not client: + temp_directory = self.config['temp_directory'] + os.makedirs(temp_directory, exist_ok=True) + self.temp_directory = mkdtemp( + suffix='-%s' % os.getpid(), + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + dir=temp_directory) + self.pypi_client = PyPIClient( + temp_directory=self.temp_directory, + cache=self.config['cache'], + cache_dir=self.config['cache_dir']) + else: + self.temp_directory = client.temp_directory + self.pypi_client = client + self.debug = self.config['debug'] + + def pre_cleanup(self): + """To prevent disk explosion if some other workers exploded + in mid-air (OOM killed), we try and clean up dangling files. 
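+
+        Only folders matching TEMPORARY_DIR_PREFIX_PATTERN under the
+        configured 'temp_directory' are considered.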
+
+        """
+        if self.debug:
+            self.log.warning('%s Will not pre-clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        clean_dangling_folders(self.config['temp_directory'],
+                               pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
+                               log=self.log)
+
+    def cleanup(self):
+        """Clean up temporary disk use
+
+        """
+        if self.debug:
+            self.log.warning('%s Will not clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        if os.path.exists(self.temp_directory):
+            self.log.debug('Clean up %s' % self.temp_directory)
+            shutil.rmtree(self.temp_directory)
+
+    def prepare_origin_visit(self, project_name, origin_url,
+                             origin_metadata_url=None):
+        """Prepare the origin visit information
+
+        Args:
+            project_name (str): Project's simple name
+            origin_url (str): Project's main url
+            origin_metadata_url (str): Project's metadata url
+
+        """
+        self.origin = {
+            'url': origin_url,
+            'type': 'pypi',
+        }
+        self.visit_date = None  # loader core will populate it
+
+    def prepare(self, project_name, origin_url,
+                origin_metadata_url=None):
+        """Keep a reference to the origin url (project) and the
+        project metadata url
+
+        Args:
+            project_name (str): Project's simple name
+            origin_url (str): Project's main url
+            origin_metadata_url (str): Project's metadata url
+
+        """
+        self.project_name = project_name
+        self.origin_url = origin_url
+        self.origin_metadata_url = origin_metadata_url
+        self.project = PyPIProject(self.pypi_client, self.project_name,
+                                   self.origin_metadata_url)
+
+    def _known_artifacts(self, last_snapshot):
+        """Retrieve the known release artifacts for the origin.
+
+        Args:
+            last_snapshot (dict): Last snapshot for the visit
+
+        Yields:
+            tuples of (artifact's filename, artifact's sha256)
+
+        """
+        revs = [rev['target'] for rev in last_snapshot['branches'].values()]
+        known_revisions = self.storage.revision_get(revs)
+        for revision in known_revisions:
+            artifact = revision['metadata']['original_artifact']
+            yield artifact['filename'], artifact['sha256']
+
+    def _last_snapshot(self):
+        """Retrieve the last snapshot of the origin.
+
+        """
+        return self.storage.snapshot_get_latest(self.origin_id)
+
+    def fetch_data(self):
+        """(override) Fetch and collect swh objects.
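+
+        This accumulates the contents, directories and revisions of the
+        new release artifacts in self._contents, self._directories and
+        self._revisions, and computes the resulting snapshot (one branch
+        per release artifact filename).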
+
+        """
+        last_snapshot = self._last_snapshot()
+        if last_snapshot:
+            self._snapshot = last_snapshot.copy()
+            known_artifacts = self._known_artifacts(self._snapshot)
+        else:
+            self._snapshot = {
+                'branches': {}
+            }
+            known_artifacts = []
+
+        self._contents = []
+        self._directories = []
+        self._revisions = []
+
+        for project_info, author, release, artifact, dir_path in \
+                self.project.releases(known_artifacts):
+
+            dir_path = dir_path.encode('utf-8')
+            directory = Directory.from_disk(path=dir_path, data=True)
+            _objects = directory.collect()
+
+            self._contents.extend(_objects['content'].values())
+            self._directories.extend(_objects['directory'].values())
+            date = normalize_timestamp(
+                int(arrow.get(artifact['date']).timestamp))
+
+            name = release['name'].encode('utf-8')
+            message = release['message'].encode('utf-8')
+            if message:
+                message = b'%s: %s' % (name, message)
+            else:
+                message = name
+
+            _revision = {
+                'synthetic': True,
+                'metadata': {
+                    'original_artifact': artifact,
+                    'project': project_info,
+                },
+                'author': author,
+                'date': date,
+                'committer': author,
+                'committer_date': date,
+                'message': message,
+                'directory': directory.hash,
+                'parents': [],
+                'type': 'tar',
+            }
+            _revision['id'] = identifier_to_bytes(
+                revision_identifier(_revision))
+            self._revisions.append(_revision)
+
+            branch_name = artifact['filename'].encode('utf-8')
+            self._snapshot['branches'][branch_name] = {
+                'target': _revision['id'],
+                'target_type': 'revision',
+            }
+
+        self._snapshot['id'] = identifier_to_bytes(
+            snapshot_identifier(self._snapshot))
+
+    def store_data(self):
+        """(override) Send the collected objects to storage.
+
+        """
+        self.maybe_load_contents(self._contents)
+        self.maybe_load_directories(self._directories)
+        self.maybe_load_revisions(self._revisions)
+        self.maybe_load_snapshot(self._snapshot)
diff --git a/swh/loader/pypi/model.py b/swh/loader/pypi/model.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/pypi/model.py
@@ -0,0 +1,10 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import logging
+import shutil
+
+from .converters import info, author
diff --git a/swh/loader/pypi/tasks.py b/swh/loader/pypi/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/pypi/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.task import Task
+
+from .loader import PyPILoader
+
+
+class LoadPyPITask(Task):
+    task_queue = 'swh_loader_pypi'
+
+    def run_task(self, project_name, project_url, project_metadata_url=None):
+        loader = PyPILoader()
+        loader.log = self.log
+        return loader.load(project_name,
+                           project_url,
+                           origin_metadata_url=project_metadata_url)
diff --git a/swh/loader/__init__.py b/swh/loader/pypi/tests/__init__.py
copy from swh/loader/__init__.py
copy to swh/loader/pypi/tests/__init__.py
diff --git a/swh/loader/pypi/tests/common.py b/swh/loader/pypi/tests/common.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/pypi/tests/common.py
@@ -0,0 +1,151 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License
version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import shutil
+import os
+import tempfile
+
+from nose.plugins.attrib import attr
+from unittest import TestCase
+
+from swh.loader.pypi.client import PyPIClient, PyPIProject
+
+
+RESOURCES_PATH = './swh/loader/pypi/tests/resources'
+
+
+class PyPIClientWithCache(PyPIClient):
+    """Force the use of the cache to bypass pypi calls
+
+    """
+    def __init__(self, temp_directory, cache_dir):
+        super().__init__(temp_directory=temp_directory,
+                         cache=True, cache_dir=cache_dir)
+
+
+class LoaderNoStorage:
+    """Mixin class to inhibit the persistence (storage calls) and keep in
+    memory the data sent.
+
+    """
+    CONFIG_BASE_FILENAME = ''  # do not provide a real path
+    ADDITIONAL_CONFIG = {
+        'storage': ('dict', {
+            'cls': 'remote',
+            'args': {
+                'url': 'http://nowhere:5002/',  # do not provide a real storage
+            }
+        }),
+
+        # do not send any data to the storage
+        'send_contents': ('bool', False),
+        'send_directories': ('bool', False),
+        'send_revisions': ('bool', False),
+        'send_releases': ('bool', False),
+        'send_snapshot': ('bool', False),
+        'debug': ('bool', False),
+    }
+
+    def __init__(self, client=None):
+        super().__init__(client=client)
+        self.all_contents = []
+        self.all_directories = []
+        self.all_revisions = []
+        self.all_releases = []
+        self.all_snapshots = []
+
+        # typed data
+        self.objects = {
+            'content': self.all_contents,
+            'directory': self.all_directories,
+            'revision': self.all_revisions,
+            'release': self.all_releases,
+            'snapshot': self.all_snapshots
+        }
+
+    def _add(self, obj_type, objs):
+        """Add objects without duplicates, keeping the insertion order.
+
+        Args:
+            obj_type (str): Type of the objects concerned by the action
+            objs ([object]): List of objects of type obj_type
+
+        """
+        col = self.objects[obj_type]
+        for o in objs:
+            if o in col:
+                continue
+            col.append(o)
+
+    def maybe_load_contents(self, all_contents):
+        self._add('content', all_contents)
+
+    def maybe_load_directories(self, all_directories):
+        self._add('directory', all_directories)
+
+    def maybe_load_revisions(self, all_revisions):
+        self._add('revision', all_revisions)
+
+    def maybe_load_releases(self, releases):
+        raise ValueError('If called, the test must break.')
+
+    def maybe_load_snapshot(self, snapshot):
+        self.objects['snapshot'].append(snapshot)
+
+    def _store_origin_visit(self):
+        pass
+
+    def open_fetch_history(self):
+        pass
+
+    def close_fetch_history_success(self, fetch_history_id):
+        pass
+
+    def close_fetch_history_failure(self, fetch_history_id):
+        pass
+
+    def update_origin_visit(self, origin_id, visit, status):
+        pass
+
+    # Override to do nothing at the end
+    def close_failure(self):
+        pass
+
+    def close_success(self):
+        pass
+
+    def pre_cleanup(self):
+        pass
+
+
+@attr('fs')
+class WithProjectTest(TestCase):
+    def setUp(self):
+        project = '0805nexter'
+        project_metadata_file = '%s/%s.json' % (RESOURCES_PATH, project)
+
+        with open(project_metadata_file) as f:
+            data = json.load(f)
+
+        temp_dir = tempfile.mkdtemp(
+            dir='/tmp/', prefix='swh.loader.pypi.tests-')
+        project_metadata_url = 'https://pypi.org/pypi/%s/json' % project
+        # Will use the pypi with cache
+        client = PyPIClientWithCache(
+            temp_directory=temp_dir, cache_dir=RESOURCES_PATH)
+        self.project = PyPIProject(
+            client=client,
+            project=project,
+            project_metadata_url=project_metadata_url,
+            data=data)
+
+        self.data = data
+        self.temp_dir = temp_dir
+        self.project_name = project
+
+    def tearDown(self):
+        if os.path.exists(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
diff --git
a/swh/loader/pypi/tests/resources/0805nexter.json b/swh/loader/pypi/tests/resources/0805nexter.json new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/tests/resources/0805nexter.json @@ -0,0 +1 @@ +{"info":{"author":"hgtkpython","author_email":"2868989685@qq.com","bugtrack_url":null,"classifiers":[],"description":"UNKNOWN","description_content_type":null,"docs_url":null,"download_url":"UNKNOWN","downloads":{"last_day":-1,"last_month":-1,"last_week":-1},"home_page":"http://www.hp.com","keywords":null,"license":"UNKNOWN","maintainer":null,"maintainer_email":null,"name":"0805nexter","package_url":"https://pypi.org/project/0805nexter/","platform":"UNKNOWN","project_url":"https://pypi.org/project/0805nexter/","project_urls":{"Download":"UNKNOWN","Homepage":"http://www.hp.com"},"release_url":"https://pypi.org/project/0805nexter/1.2.0/","requires_dist":null,"requires_python":null,"summary":"a simple printer of nested lest","version":"1.2.0"},"last_serial":1931736,"releases":{"1.1.0":[{"comment_text":"","digests":{"md5":"07fc93fc12821c1405c3483db88154af","sha256":"52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035"},"downloads":-1,"filename":"0805nexter-1.1.0.zip","has_sig":false,"md5_digest":"07fc93fc12821c1405c3483db88154af","packagetype":"sdist","python_version":"source","requires_python":null,"size":862,"upload_time":"2016-01-31T05:28:42","url":"https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip"}],"1.2.0":[{"comment_text":"","digests":{"md5":"89123c78bd5d3f61cb8f46029492b18a","sha256":"49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709"},"downloads":-1,"filename":"0805nexter-1.2.0.zip","has_sig":false,"md5_digest":"89123c78bd5d3f61cb8f46029492b18a","packagetype":"sdist","python_version":"source","requires_python":null,"size":898,"upload_time":"2016-01-31T05:51:25","url":"https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip"}]},"urls":[{"comment_text":"","digests":{"md5":"89123c78bd5d3f61cb8f46029492b18a","sha256":"49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709"},"downloads":-1,"filename":"0805nexter-1.2.0.zip","has_sig":false,"md5_digest":"89123c78bd5d3f61cb8f46029492b18a","packagetype":"sdist","python_version":"source","requires_python":null,"size":898,"upload_time":"2016-01-31T05:51:25","url":"https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip"}]} \ No newline at end of file diff --git a/swh/loader/pypi/tests/test_client.py b/swh/loader/pypi/tests/test_client.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/tests/test_client.py @@ -0,0 +1,78 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os + +from nose.tools import istest + +from swh.loader.pypi import converters +from swh.loader.pypi.client import _project_pkginfo + +from .common import WithProjectTest + + +class PyPIProjectTest(WithProjectTest): + @istest + def releases(self): + actual_releases = self.project.releases([]) + + expected_release_artifacts = { + '1.1.0': { + 'archive_type': 'zip', + 'blake2s256': 'df9413bde66e6133b10cadefad6fcf9cbbc369b47831089112c846d79f14985a', # noqa + 'date': '2016-01-31T05:28:42', + 'filename': 
'0805nexter-1.1.0.zip', + 'sha1': '127d8697db916ba1c67084052196a83319a25000', + 'sha1_git': '4b8f1350e6d9fa00256e974ae24c09543d85b196', + 'sha256': '52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035', # noqa + 'size': 862, + 'url': 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa + }, + '1.2.0': { + 'archive_type': 'zip', + 'blake2s256': '67010586b5b9a4aaa3b1c386f9dc8b4c99e6e40f37732a717a5f9b9b1185e588', # noqa + 'date': '2016-01-31T05:51:25', + 'filename': '0805nexter-1.2.0.zip', + 'sha1': 'd55238554b94da7c5bf4a349ece0fe3b2b19f79c', + 'sha1_git': '8638d33a96cb25d8319af21417f00045ec6ee810', + 'sha256': '49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709', # noqa + 'size': 898, + 'url': 'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip', # noqa + } + } + + expected_releases = { + '1.1.0': { + 'name': '1.1.0', + 'message': '', + }, + '1.2.0': { + 'name': '1.2.0', + 'message': '', + }, + } + + dir_paths = [] + for pkginfo, author, release, artifact, dir_path in actual_releases: + version = pkginfo['version'] + expected_pkginfo = _project_pkginfo(dir_path) + self.assertEquals(pkginfo, expected_pkginfo) + expected_author = converters.author(expected_pkginfo) + self.assertEqual(author, expected_author) + expected_artifact = expected_release_artifacts[version] + self.assertEqual(artifact, expected_artifact) + expected_release = expected_releases[version] + self.assertEqual(release, expected_release) + + self.assertTrue(version in dir_path) + self.assertTrue(self.project_name in dir_path) + # path still exists + self.assertTrue(os.path.exists(dir_path)) + dir_paths.append(dir_path) + + # Ensure uncompressed paths have been destroyed + for dir_path in dir_paths: + # path no longer exists + self.assertFalse(os.path.exists(dir_path)) diff --git a/swh/loader/pypi/tests/test_converters.py b/swh/loader/pypi/tests/test_converters.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/tests/test_converters.py @@ -0,0 +1,113 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from unittest import TestCase +from nose.tools import istest + +from swh.loader.pypi.converters import author, info + +from .common import WithProjectTest + + +class Test(WithProjectTest): + @istest + def info(self): + actual_info = self.project.info() + + expected_info = { + 'home_page': self.data['info']['home_page'], + 'description': self.data['info']['description'], + 'summary': self.data['info']['summary'], + 'license': self.data['info']['license'], + 'package_url': self.data['info']['package_url'], + 'project_url': self.data['info']['project_url'], + 'upstream': self.data['info']['project_urls']['Homepage'], + } + + self.assertEqual(expected_info, actual_info) + + @istest + def author(self): + info = self.data['info'] + actual_author = author(info) + + name = info['author'].encode('utf-8') + email = info['author_email'].encode('utf-8') + expected_author = { + 'fullname': b'%s <%s>' % (name, email), + 'name': name, + 'email': email, + } + + self.assertEqual(expected_author, actual_author) + + +class ParseAuthorTest(TestCase): + @istest + def author_basic(self): + data = { + 'author': "i-am-groot", + 'author_email': 'iam@groot.org', 
+        }
+        actual_author = author(data)
+
+        expected_author = {
+            'fullname': b'i-am-groot <iam@groot.org>',
+            'name': b'i-am-groot',
+            'email': b'iam@groot.org',
+        }
+
+        self.assertEquals(actual_author, expected_author)
+
+    @istest
+    def author_malformed(self):
+        data = {
+            'author': "['pierre', 'paul', 'jacques']",
+            'author_email': None,
+        }
+
+        actual_author = author(data)
+
+        expected_author = {
+            'fullname': b"['pierre', 'paul', 'jacques']",
+            'name': b"['pierre', 'paul', 'jacques']",
+            'email': None,
+        }
+
+        self.assertEquals(actual_author, expected_author)
+
+    @istest
+    def author_malformed_2(self):
+        data = {
+            'author': '[marie, jeanne]',
+            'author_email': '[marie@some, jeanne@thing]',
+        }
+
+        actual_author = author(data)
+
+        expected_author = {
+            'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
+            'name': b'[marie, jeanne]',
+            'email': b'[marie@some, jeanne@thing]',
+        }
+
+        self.assertEquals(actual_author, expected_author)
+
+    @istest
+    def author_malformed_3(self):
+        data = {
+            'author': '[marie, jeanne, pierre]',
+            'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
+        }
+
+        actual_author = author(data)
+
+        expected_author = {
+            'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>',  # noqa
+            'name': b'[marie, jeanne, pierre]',
+            'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
+        }
+
+        self.assertEquals(actual_author, expected_author)
diff --git a/swh/loader/pypi/tests/test_loader.py b/swh/loader/pypi/tests/test_loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/pypi/tests/test_loader.py
@@ -0,0 +1,258 @@
+# Copyright (C) 2016-2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import shutil
+import tempfile
+
+from nose.plugins.attrib import attr
+from nose.tools import istest
+from unittest import TestCase
+
+from swh.model import hashutil
+
+from swh.loader.pypi.client import PyPIProject
+from swh.loader.pypi.loader import PyPILoader
+from .common import PyPIClientWithCache, RESOURCES_PATH, LoaderNoStorage
+
+
+class TestPyPILoader(LoaderNoStorage, PyPILoader):
+    """Real PyPILoader for test purposes (storage and pypi interactions
+    inhibited)
+
+    """
+    def __init__(self, project_name):
+        project_metadata_file = '%s/%s.json' % (RESOURCES_PATH, project_name)
+        project_metadata_url = 'https://pypi.org/pypi/%s/json' % project_name
+        with open(project_metadata_file) as f:
+            data = json.load(f)
+
+        temp_dir = tempfile.mkdtemp(
+            dir='/tmp/', prefix='swh.loader.pypi.tests-')
+        # Will use the pypi with cache
+        client = PyPIClientWithCache(
+            temp_directory=temp_dir, cache_dir=RESOURCES_PATH)
+        super().__init__(client=client)
+        self.project = PyPIProject(
+            client=client,
+            project=project_name,
+            project_metadata_url=project_metadata_url,
+            data=data)
+
+    def prepare(self, project_name, origin_url,
+                origin_metadata_url=None):
+        self.project_name = project_name
+        self.origin_url = origin_url
+        self.origin_metadata_url = origin_metadata_url
+        self.visit = 1  # first visit
+
+
+@attr('fs')
+class BaseLoaderITest(TestCase):
+    """Base loader test class, preparing the PyPI project to 'load' in a
+    test context.
+
+    In this setup, the loader uses the cache to load data, so there is
+    no network interaction (no storage, no pypi).
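+
+    Only the project metadata (json) and release archives cached under
+    tests/resources are used.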
+ + """ + def setUp(self, project_name='0805nexter', + dummy_pypi_instance='https://dummy.org'): + self.tmp_root_path = tempfile.mkdtemp() + self.loader = PyPILoaderNoSnapshot(project_name=project_name) + self._project = project_name + self._origin_url = '%s/pypi/%s/' % (dummy_pypi_instance, project_name) + self._project_metadata_url = '%s/pypi/%s/json' % ( + dummy_pypi_instance, project_name) + + def tearDown(self): + shutil.rmtree(self.tmp_root_path) + + def assertContentsOk(self, expected_contents): + contents = self.loader.all_contents + self.assertEquals(len(contents), len(expected_contents)) + + for content in contents: + content_id = hashutil.hash_to_hex(content['sha1']) + self.assertIn(content_id, expected_contents) + + def assertDirectoriesOk(self, expected_directories): + directories = self.loader.all_directories + self.assertEquals(len(directories), len(expected_directories)) + + for _dir in directories: + _dir_id = hashutil.hash_to_hex(_dir['id']) + self.assertIn(_dir_id, expected_directories) + + def assertSnapshotOk(self, expected_snapshot, expected_revisions): + snapshots = self.loader.all_snapshots + self.assertEqual(len(snapshots), 1) + + snap = snapshots[0] + snap_id = hashutil.hash_to_hex(snap['id']) + self.assertEqual(snap_id, expected_snapshot) + + branches = snap['branches'] + self.assertEqual(len(expected_revisions), len(branches)) + + for branch, target in branches.items(): + rev_id = hashutil.hash_to_hex(target['target']) + self.assertIn(rev_id, expected_revisions) + self.assertEqual('revision', target['target_type']) + + def assertRevisionsOk(self, expected_revisions): # noqa: N802 + """Check the loader's revisions match the expected revisions. + + Expects self.loader to be instantiated and ready to be + inspected (meaning the loading took place). + + Args: + expected_revisions (dict): Dict with key revision id, + value the targeted directory id. + + """ + # The last revision being the one used later to start back from + for rev in self.loader.all_revisions: + rev_id = hashutil.hash_to_hex(rev['id']) + directory_id = hashutil.hash_to_hex(rev['directory']) + + self.assertEquals(expected_revisions[rev_id], directory_id) + + +# Define loaders with no storage +# They'll just accumulate the data in place +# Only for testing purposes. 
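+# PyPILoaderNoSnapshot simulates a first visit (no snapshot stored yet),
+# while PyPILoaderWithSnapshot simulates a later visit for which both
+# release artifacts are already known.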
+
+
+class PyPILoaderNoSnapshot(TestPyPILoader):
+    """Same as TestPyPILoader with no prior snapshot seen
+
+    """
+    def _last_snapshot(self):
+        return None
+
+
+class LoaderITest(BaseLoaderITest):
+    def setUp(self, project_name='0805nexter',
+              dummy_pypi_instance='https://dummy.org'):
+        super().setUp(project_name, dummy_pypi_instance)
+        self.loader = PyPILoaderNoSnapshot(project_name=project_name)
+
+    @istest
+    def load(self):
+        """Load a pypi origin
+
+        """
+        # when
+        self.loader.load(
+            self._project, self._origin_url, self._project_metadata_url)
+
+        # then
+        self.assertEquals(len(self.loader.all_contents), 6,
+                          '3 contents per release artifact files (2)')
+        self.assertEquals(len(self.loader.all_directories), 4)
+        self.assertEquals(len(self.loader.all_revisions), 2,
+                          '2 releases so 2 revisions should be created')
+        self.assertEquals(len(self.loader.all_releases), 0,
+                          'No release is created in the pypi loader')
+        self.assertEquals(len(self.loader.all_snapshots), 1,
+                          'Only 1 snapshot targeting all revisions')
+
+        expected_contents = [
+            'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
+            '938c33483285fd8ad57f15497f538320df82aeb8',
+            'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
+            '405859113963cb7a797642b45f171d6360425d16',
+            'e5686aa568fdb1d19d7f1329267082fe40482d31',
+            '83ecf6ec1114fd260ca7a833a2d165e71258c338',
+        ]
+
+        self.assertContentsOk(expected_contents)
+
+        expected_directories = [
+            '05219ba38bc542d4345d5638af1ed56c7d43ca7d',
+            'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
+            'b178b66bd22383d5f16f4f5c923d39ca798861b4',
+            'c3a58f8b57433a4b56caaa5033ae2e0931405338',
+        ]
+        self.assertDirectoriesOk(expected_directories)
+
+        # {revision hash: directory hash}
+        expected_revisions = {
+            '4c99891f93b81450385777235a37b5e966dd1571': '05219ba38bc542d4345d5638af1ed56c7d43ca7d',  # noqa
+            'e445da4da22b31bfebb6ffc4383dbf839a074d21': 'b178b66bd22383d5f16f4f5c923d39ca798861b4',  # noqa
+        }
+        self.assertRevisionsOk(expected_revisions)
+
+        self.assertSnapshotOk('f456b03e8bf1920d64b00df234b1efedc25b6c93',
+                              expected_revisions)
+
+
+class PyPILoaderWithSnapshot(TestPyPILoader):
+    """Same as TestPyPILoader but with a prior snapshot seen
+
+    """
+    def _last_snapshot(self):
+        return {
+            'id': b'\xf4V\xb0>\x8b\xf1\x92\rd\xb0\r\xf24\xb1\xef\xed\xc2[l\x93',  # noqa
+            'branches': {
+                b'0805nexter-1.1.0.zip': {
+                    'target': b'L\x99\x89\x1f\x93\xb8\x14P'
+                              b'8Ww#Z7\xb5\xe9f\xdd\x15q',
+                    'target_type': 'revision'
+                },
+                b'0805nexter-1.2.0.zip': {
+                    'target': b'\xe4E\xdaM\xa2+1\xbf'
+                              b'\xeb\xb6\xff\xc48=\xbf\x83'
+                              b'\x9a\x07M!',
+                    'target_type': 'revision'
+                },
+            },
+        }
+
+    def _known_artifacts(self, last_snapshot):
+        yield from [
+            (
+                '0805nexter-1.1.0.zip',
+                '52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035'  # noqa
+            ),
+            (
+                '0805nexter-1.2.0.zip',
+                '49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709'  # noqa
+            )
+        ]
+
+
+class LoaderWithOriginAlreadySeenITest(BaseLoaderITest):
+    def setUp(self, project_name='0805nexter',
+              dummy_pypi_instance='https://dummy.org'):
+        super().setUp(project_name, dummy_pypi_instance)
+        self.loader = PyPILoaderWithSnapshot(project_name=project_name)
+
+    @istest
+    def load(self):
+        """Loading an already injected pypi origin results in only 1
+        snapshot
+
+        """
+        # when
+        self.loader.load(
+            self._project, self._origin_url, self._project_metadata_url)
+
+        # then
+        self.assertEquals(len(self.loader.all_contents), 0)
+        self.assertEquals(len(self.loader.all_directories), 0)
+        self.assertEquals(len(self.loader.all_revisions), 0)
+        self.assertEquals(len(self.loader.all_releases), 0)
+        self.assertEquals(len(self.loader.all_snapshots), 1)
+
+        self.assertContentsOk([])
+        self.assertDirectoriesOk([])
+        self.assertRevisionsOk(expected_revisions={})
+
+        expected_revisions = {
+            '4c99891f93b81450385777235a37b5e966dd1571': '05219ba38bc542d4345d5638af1ed56c7d43ca7d',  # noqa
+            'e445da4da22b31bfebb6ffc4383dbf839a074d21': 'b178b66bd22383d5f16f4f5c923d39ca798861b4',  # noqa
+        }
+        self.assertSnapshotOk('f456b03e8bf1920d64b00df234b1efedc25b6c93',
+                              expected_revisions)