diff --git a/debian/changelog b/debian/changelog --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -swh-loader-pypy (0.0.1-1) unstable; urgency=low +swh-loader-pypi (0.0.1-1) unstable; urgency=low * Initial bootstrap diff --git a/debian/control b/debian/control --- a/debian/control +++ b/debian/control @@ -5,7 +5,10 @@ Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-all, + python3-arrow, python3-nose, + python3-pkginfo, + python3-requests, python3-setuptools, python3-swh.core, python3-swh.storage, diff --git a/debian/rules b/debian/rules --- a/debian/rules +++ b/debian/rules @@ -9,3 +9,4 @@ override_dh_install: dh_install rm -v $(CURDIR)/debian/python3-*/usr/lib/python*/dist-packages/swh/__init__.py + rm -v $(CURDIR)/debian/python3-*/usr/lib/python*/dist-packages/swh/loader/__init__.py diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ setuptools vcversioner +requests +arrow +pkginfo diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -35,6 +35,6 @@ install_requires=parse_requirements() + parse_requirements('swh'), test_requires=parse_requirements('test'), setup_requires=['vcversioner'], - vcversioner={}, + vcversioner={'version_module_paths': ['swh/loader/pypi/_version.py']}, include_package_data=True, ) diff --git a/swh/loader/__init__.py b/swh/loader/__init__.py --- a/swh/loader/__init__.py +++ b/swh/loader/__init__.py @@ -0,0 +1 @@ +__path__ = __import__('pkgutil').extend_path(__path__, __name__) diff --git a/swh/loader/pypi/.gitignore b/swh/loader/pypi/.gitignore new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/.gitignore @@ -0,0 +1 @@ +_version.py diff --git a/swh/loader/pypi/client.py b/swh/loader/pypi/client.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/client.py @@ -0,0 +1,262 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import arrow +import hashlib +import logging +import os +import requests + +from pkginfo import UnpackedSDist +from shutil import copyfile + +from swh.core import tarball +from swh.model import hashutil + +try: + from swh.loader.pypi._version import __version__ +except ImportError: + __version__ = 'devel' + + +def convert_to_hex(d): + """Convert a flat dictionary with bytes values to the same dictionary + with hex strings as values. + + Args: + d (dict): flat dictionary with sha bytes as values. + + Returns: + Mirror dictionary with hex strings as values. + + """ + if not d: + return d + + checksums = {} + for key, h in d.items(): + if isinstance(h, bytes): + checksums[key] = hashutil.hash_to_hex(h) + else: + checksums[key] = h + + return checksums + + +def _to_dict(pkginfo): + """Given a pkginfo object, convert it to a dict. + + """ + m = {} + for k in pkginfo: + m[k] = getattr(pkginfo, k) + return m + + +def _project_pkginfo(dir_path): + """Given an uncompressed archive path holding a PKG-INFO file, return the + parsed pkginfo as a dict. + + """ + project_dirname = os.listdir(dir_path)[0] + pkginfo_path = os.path.join(dir_path, project_dirname, 'PKG-INFO') + if not os.path.exists(pkginfo_path): + return None + pkginfo = UnpackedSDist(pkginfo_path) + return _to_dict(pkginfo) + + +class PyPiClient: + """PyPi client in charge of communicating with the pypi server.
+ + """ + def __init__(self, temp_directory=None, cache=False, cache_dir=None): + self.version = __version__ + self.temp_directory = temp_directory + + self.do_cache = cache + if self.do_cache: + self.cache_dir = cache_dir + self.cache_raw_dir = os.path.join(cache_dir, 'archives') + os.makedirs(self.cache_raw_dir, exist_ok=True) + self.session = requests.session() + self.params = { + 'headers': { + 'User-Agent': 'Software Heritage PyPi Loader (%s)' % ( + __version__ + ) + } + } + + def _save_response(self, response, project=None): + """Log the response from a server request to a cache dir. + + Args: + response (Response): full server response + cache_dir (str): system path for cache dir + + Returns: + nothing + + """ + import gzip + from json import dumps + datepath = arrow.utcnow().isoformat() + name = '%s.gz' % datepath if project is None else '%s-%s.gz' % ( + project, datepath) + fname = os.path.join(self.cache_dir, name) + with gzip.open(fname, 'w') as f: + f.write(bytes( + dumps(response.json()), + 'utf-8' + )) + + def _save_raw(self, filepath): + """In cache mode, backup the filepath to self.cache_raw_dir + + Args: + filepath (str): Path of the file to save + + """ + _filename = os.path.basename(filepath) + _archive = os.path.join(self.cache_raw_dir, _filename) + copyfile(filepath, _archive) + + def _get_raw(self, filepath): + """In cache mode, we try to retrieve the cached file. + + """ + _filename = os.path.basename(filepath) + _archive = os.path.join(self.cache_raw_dir, _filename) + if not os.path.exists(_archive): + return None + copyfile(_archive, filepath) + return filepath + + def _get(self, url, project=None): + """Get query to the url. + + Args: + url (str): Url + + Raises: + ValueError in case of failing to query + + Returns: + Response as dict if ok + + """ + response = self.session.get(url, **self.params) + if response.status_code != 200: + raise ValueError("Fail to query '%s'. Reason: %s" % ( + url, response.status_code)) + + if self.do_cache: + self._save_response(response, project=project) + + return response.json() + + def info(self, project_url, project=None): + """Given a metadata project url, retrieve the raw json response + + Args: + project_url (str): Project's pypi to retrieve information + + Returns: + Main project information as dict. + + """ + return self._get(project_url, project=project) + + def release(self, project, release): + """Given a project and a release name, retrieve the raw information + for said project's release. + + Args: + project (str): Project's name + release (dict): Release information + + Returns: + Release information as dict + + """ + release_url = 'https://pypi.org/pypi/%s/%s/json' % (project, release) + return self._get(release_url, project=project) + + def fetch_release_artifact(self, project, release): + """Fetch for a given release project the associated artifact. 
+ + This: + - fetches the artifact + - checks that the size and hashes match + - uncompresses the artifact locally + - computes the swh hashes + - returns the associated information for the artifact + + Args: + project (str): Project's name + release (dict): Release information + + Returns: + tuple (release, archive_path, uncompressed_archive_path, pkginfo): + + release (dict): release information + archive_path (str): fetched archive's local path + uncompressed_archive_path (str): uncompressed archive path + pkginfo (dict): package information or None if none found + + """ + version = release['name'] + logging.debug('Release version: %s' % version) + path = os.path.join(self.temp_directory, project, version) + os.makedirs(path, exist_ok=True) + filepath = os.path.join(path, release['filename']) + logging.debug('Release local path: %s' % filepath) + + _filepath = None + if self.do_cache: + _filepath = self._get_raw(filepath) + + if not _filepath: # no cache hit, we fetch from pypi + url = release['url'] + r = self.session.get(url, **self.params) + if r.status_code != 200: + raise ValueError("Failed to query '%s'. Reason: %s" % ( + url, r.status_code)) + + _len = len(r.content) + if _len != release['size']: + raise ValueError('Error when checking size: %s != %s' % ( + release['size'], _len)) + + # check the digest while writing to disk + h = hashlib.sha256() + with open(filepath, 'wb') as f: + for chunk in r.iter_content(): + h.update(chunk) + f.write(chunk) + + actual_digest = h.hexdigest() + if actual_digest != release['sha256']: + raise ValueError( + '%s %s: Checksum mismatch: %s != %s' % ( + project, version, release['sha256'], actual_digest)) + + if self.do_cache: + self._save_raw(filepath) + + uncompress_path = os.path.join(path, 'uncompress') + os.makedirs(uncompress_path, exist_ok=True) + + nature = tarball.uncompress(filepath, uncompress_path) + + hashes = hashutil.hash_path(filepath) + hashes.pop('length') # the 'size' entry already holds that information + artifact = convert_to_hex(hashes) + artifact['archive_type'] = nature + for key, value in artifact.items(): + release[key] = value + + pkginfo = _project_pkginfo(uncompress_path) + return release, filepath, uncompress_path, pkginfo diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/loader.py @@ -0,0 +1,206 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import arrow +import os +import shutil + +from tempfile import mkdtemp + +from swh.loader.core.utils import clean_dangling_folders +from swh.loader.core.loader import SWHLoader +from swh.model.from_disk import Directory +from swh.model.identifiers import ( + revision_identifier, snapshot_identifier, + identifier_to_bytes, normalize_timestamp +) + +from .client import PyPiClient +from .model import PyPiProject + + +TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.'
+DEBUG_MODE = '** DEBUG MODE **' + + +class PyPiLoader(SWHLoader): + CONFIG_BASE_FILENAME = 'loader/pypi' + ADDITIONAL_CONFIG = { + 'temp_directory': ('str', '/tmp/swh.loader.pypi/'), + 'cache': ('bool', False), + 'cache_dir': ('str', ''), + 'debug': ('bool', False), # NOT FOR PRODUCTION + } + + def __init__(self, client=None): + super().__init__(logging_class='swh.loader.pypi.PyPiLoader') + self.origin_id = None + if not client: + temp_directory = self.config['temp_directory'] + os.makedirs(temp_directory, exist_ok=True) + self.temp_directory = mkdtemp( + suffix='-%s' % os.getpid(), + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + dir=temp_directory) + self.pypi_client = PyPiClient( + temp_directory=self.temp_directory, + cache=self.config['cache'], + cache_dir=self.config['cache_dir']) + else: + self.temp_directory = client.temp_directory + self.pypi_client = client + self.debug = self.config['debug'] + + def pre_cleanup(self): + """(override) To prevent disk explosion if some other workers exploded + in mid-air (OOM killed), we try to clean up dangling files. + + """ + if self.debug: + self.log.warning('%s Will not pre-clean up temp dir %s' % ( + DEBUG_MODE, self.temp_directory + )) + return + clean_dangling_folders(self.config['temp_directory'], + pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, + log=self.log) + + def cleanup(self): + """(override) Clean up temporary disk use + + """ + if self.debug: + self.log.warning('%s Will not clean up temp dir %s' % ( + DEBUG_MODE, self.temp_directory + )) + return + if os.path.exists(self.temp_directory): + self.log.debug('Clean up %s' % self.temp_directory) + shutil.rmtree(self.temp_directory) + + def prepare_origin_visit(self, project_name, origin_url, + origin_metadata_url=None): + """(override) Prepare the origin visit information + + Args: + project_name (str): Project's simple name + origin_url (str): Project's main url + origin_metadata_url (str): Project's metadata url + + """ + self.origin = { + 'url': origin_url, + 'type': 'pypi', + } + self.visit_date = None # loader core will populate it + + def prepare(self, project_name, origin_url, + origin_metadata_url=None): + """(override) Keep a reference to the origin url (project) and the + project metadata url + + Args: + project_name (str): Project's simple name + origin_url (str): Project's main url + origin_metadata_url (str): Project's metadata url + + """ + self.project_name = project_name + self.origin_url = origin_url + self.origin_metadata_url = origin_metadata_url + self.project = PyPiProject(self.pypi_client, self.project_name, + self.origin_metadata_url) + + def _known_releases(self, _last_snapshot): + """Retrieve the already known releases/artifacts for the origin_id. + + Yields: + tuple: (artifact's filename, artifact's sha256) + + """ + _revs = [rev['target'] for rev in _last_snapshot['branches'].values()] + _known_revisions = self.storage.revision_get(_revs) + for _rev in _known_revisions: + _artifact = _rev['metadata']['original_artifact'] + yield _artifact['filename'], _artifact['sha256'] + + def _last_snapshot(self): + """Retrieve the last snapshot + + """ + return self.storage.snapshot_get_latest(self.origin_id) + + def fetch_data(self): + """(override) Fetch and collect swh objects.
+ + """ + _last_snapshot = self._last_snapshot() + if _last_snapshot: + self._snapshot = _last_snapshot.copy() + _known_releases = self._known_releases(self._snapshot) + else: + self._snapshot = { + 'branches': {} + } + _known_releases = [] + + self._contents = [] + self._directories = [] + self._revisions = [] + + for release_info, author, release, dirpath in self.project.releases( + _known_releases): + dirpath = dirpath.encode('utf-8') + directory = Directory.from_disk(path=dirpath, data=True) + _objects = directory.collect() + + self._contents.extend(_objects['content'].values()) + self._directories.extend(_objects['directory'].values()) + date = normalize_timestamp( + int(arrow.get(release['date']).timestamp)) + + name = release['name'].encode('utf-8') + message = release['message'].encode('utf-8') + if message: + message = b'%s: %s' % (name, message) + else: + message = name + + _revision = { + 'synthetic': True, + 'metadata': { + 'original_artifact': release, + 'project': release_info, + }, + 'author': author, + 'date': date, + 'committer': author, + 'committer_date': date, + 'message': message, + 'directory': directory.hash, + 'parents': [], + 'type': 'tar', + } + _revision['id'] = identifier_to_bytes( + revision_identifier(_revision)) + self._revisions.append(_revision) + + branch_name = release['filename'].encode('utf-8') + self._snapshot['branches'][branch_name] = { + 'target': _revision['id'], + 'target_type': 'revision', + } + + self._snapshot['id'] = identifier_to_bytes( + snapshot_identifier(self._snapshot)) + + def store_data(self): + """(override) This sends collected objects to storage. + + """ + self.maybe_load_contents(self._contents) + self.maybe_load_directories(self._directories) + self.maybe_load_revisions(self._revisions) + self.maybe_load_snapshot(self._snapshot) diff --git a/swh/loader/pypi/model.py b/swh/loader/pypi/model.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/model.py @@ -0,0 +1,216 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import logging +import shutil + + +def info(data): + """Given a dict of data, returns a project subset. + + """ + _info = data['info'] + default = { + 'home_page': _info['home_page'], + 'description': _info['description'], + 'summary': _info['summary'], + 'license': _info['license'], + 'package_url': _info['package_url'], + 'project_url': _info['project_url'], + 'upstream': None, + } + + project_urls = _info.get('project_urls') + if project_urls: + homepage = project_urls.get('Homepage') + if homepage: + default['upstream'] = homepage + + return default + + +def author(data): + """Given a dict of data, returns an author subset. 
+ + """ + name = data['author'] + email = data['author_email'] + if email: + fullname = '%s <%s>' % (name, email) + else: + fullname = name + + if not fullname: + return {'fullname': b'', 'name': None, 'email': None} + + if fullname: + fullname = fullname.encode('utf-8') + + if name: + name = name.encode('utf-8') + + if email: + email = email.encode('utf-8') + + return {'fullname': fullname, 'name': name, 'email': email} + + +class PyPiProject: + """PyPi project representation + + This permits to extract information for the: + - project, either the latest information (from the last revision) + - either the information for a given release + - Symmetrically for the release author information + + This also fetches and uncompress the associated release artifacts. + + """ + def __init__(self, client, project, project_metadata_url, data=None): + self.client = client + self.project = project + self.project_metadata_url = project_metadata_url + if data: + self.data = data + else: + self.data = client.info(project_metadata_url, project) + + self.last_version = self.data['info']['version'] + self.cache = { + self.last_version: self.data + } + + def _data(self, release_name=None): + """Fetch data per release and cache it. Returns the cache retrieved + data if already fetched. + + """ + if release_name: + data = self.cache.get(release_name) + if not data: + data = self.client.release(self.project, release_name) + self.cache[release_name] = data + else: + data = self.data + return data + + def info(self, release_name=None): + """Compute release information for provided release (or latest one). + + """ + return info(self._data(release_name)) + + def _filter_releases(self, version, release, known_releases): + """Filter not already known sdist (source distribution) release. + + There can be multiple 'package_type' (sdist, bdist_egg, + bdist_wheel, bdist_rpm, bdist_msi, bdist_wininst, ...), we are + only interested in source distribution (sdist), others bdist* + are binary + + Args: + version (str): Release name or version + release (dict/[dict]): Full release object (or list of) + known_releases ([tuple]): List of known releases (tuple filename, + sha256) + + """ + if not release: + return [] + if not isinstance(release, list): + release = [release] + _releases = [] + for _rel in release: + _name = _rel['filename'] + _sha256 = _rel['digests']['sha256'] + if (_name, _sha256) in known_releases: + logging.debug('artifact (%s, %s) already seen for release %s, skipping' % ( # noqa + _name, _sha256, version)) + continue + if _rel['packagetype'] != 'sdist': + continue + _releases.append(_rel) + return _releases + + def _cleanup_release_artifacts(self, archive_path, directory_path): + """Clean intermediary files which no longer needs to be present. 
+ + """ + if directory_path and os.path.exists(directory_path): + logging.debug('Clean up uncompressed archive path %s' % ( + directory_path, )) + shutil.rmtree(directory_path) + + if archive_path and os.path.exists(archive_path): + logging.debug('Clean up archive %s' % archive_path) + os.unlink(archive_path) + + def _fetch_and_uncompress_releases(self, version, releases): + """Fetch an uncompress sdist releases + + Args: + version (str): Release name or version + releases ([dict]): List of source distribution release artifacts + + Yields: + tuple (release, filepath, uncompressed_path) + + """ + for release in releases: + # flatten the metadata to ease reading + _flattenned_release = { + 'name': version, + 'message': release.get('comment_text', ''), + 'sha256': release['digests']['sha256'], + 'size': release['size'], + 'filename': release['filename'], + 'url': release['url'], + 'date': release['upload_time'], + } + + # fetch and write locally archives + yield self.client.fetch_release_artifact( + self.project, _flattenned_release) + + def releases(self, known_releases): + """Fetch metadata/data per release (if new release artifact detected) + + For new release artifact, this: + - downloads and uncompresses the release artifacts. + - yields the (release info, author info, release, dir_path) + - Clean up the intermediary fetched artifact files + + Args: + known_releases (tuple): artifact name, artifact sha256 hash + + Yields: + tuple (version, release_info, release, uncompressed_path) where: + + - release_info (dict): release's associated version info + - author (dict): Author information for the release + - release (dict): release metadata + - uncompressed_path (str): Path to uncompressed artifact + + """ + releases_dict = self.data['releases'] + for version, releases in releases_dict.items(): + releases = self._filter_releases(version, releases, known_releases) + if not releases: + logging.warn('%s %s: No source artifact found, skipping' % ( + self.project, version)) + continue + + _releases = self._fetch_and_uncompress_releases(version, releases) + for _release, _archive, _dir_path, _pkginfo in _releases: + _release_info = _pkginfo + if _release_info is None: # fallback to pypi api metadata + msg = '%s %s: No PKG-INFO detected, skipping' % ( # noqa + self.project, _release['name']) + logging.warn(msg) + continue + _author = author(_release_info) + yield _release_info, _author, _release, _dir_path + self._cleanup_release_artifacts(_archive, _dir_path) diff --git a/swh/loader/pypi/tasks.py b/swh/loader/pypi/tasks.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/tasks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.scheduler.task import Task + +from .loader import PyPiLoader + + +class LoadPyPiTsk(Task): + task_queue = 'swh_loader_pypi' + + def run_task(self, project_name, project_url, project_metadata_url=None): + loader = PyPiLoader() + loader.log = self.log + return loader.load(project_name, + project_url, + origin_metadata_url=project_metadata_url) diff --git a/swh/loader/__init__.py b/swh/loader/pypi/tests/__init__.py copy from swh/loader/__init__.py copy to swh/loader/pypi/tests/__init__.py diff --git a/swh/loader/pypi/tests/common.py b/swh/loader/pypi/tests/common.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/tests/common.py @@ 
-0,0 +1,114 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from swh.loader.pypi.client import PyPiClient + + +RESOURCES_PATH = './swh/loader/pypi/tests/resources' + + +class PyPiClientWithCache(PyPiClient): + """Force the use of the cache to bypass pypi calls + + """ + def __init__(self, temp_directory, cache_dir): + super().__init__(temp_directory=temp_directory, + cache=True, cache_dir=cache_dir) + + +class LoaderNoStorage: + """Mixin class to inhibit the persistence (storage calls) and keep in + memory the data sent. + + """ + CONFIG_BASE_FILENAME = '' # do not provide a real path + ADDITIONAL_CONFIG = { + 'storage': ('dict', { + 'cls': 'remote', + 'args': { + 'url': 'http://nowhere:5002/', # do not provide a real storage + } + }), + + # do not send any data to the storage + 'send_contents': ('bool', False), + 'send_directories': ('bool', False), + 'send_revisions': ('bool', False), + 'send_releases': ('bool', False), + 'send_snapshot': ('bool', False), + 'debug': ('bool', False), + } + + def __init__(self, client=None): + super().__init__(client=client) + self.all_contents = [] + self.all_directories = [] + self.all_revisions = [] + self.all_releases = [] + self.all_snapshots = [] + + # typed data + self.objects = { + 'content': self.all_contents, + 'directory': self.all_directories, + 'revision': self.all_revisions, + 'release': self.all_releases, + 'snapshot': self.all_snapshots + } + + def _add(self, type, l): + """Add without duplicates and keeping the insertion order. + + Args: + type (str): Type of objects concerned by the action + l ([object]): List of 'type' object + + """ + col = self.objects[type] + for o in l: + if o in col: + continue + col.extend([o]) + + def maybe_load_contents(self, all_contents): + self._add('content', all_contents) + + def maybe_load_directories(self, all_directories): + self._add('directory', all_directories) + + def maybe_load_revisions(self, all_revisions): + self._add('revision', all_revisions) + + def maybe_load_releases(self, releases): + raise ValueError('If called, the test must break.') + + def maybe_load_snapshot(self, snapshot): + self.objects['snapshot'].append(snapshot) + + def _store_origin_visit(self): + pass + + def open_fetch_history(self): + pass + + def close_fetch_history_success(self, fetch_history_id): + pass + + def close_fetch_history_failure(self, fetch_history_id): + pass + + def update_origin_visit(self, origin_id, visit, status): + pass + + # Override to do nothing at the end + def close_failure(self): + pass + + def close_success(self): + pass + + def pre_cleanup(self): + pass diff --git a/swh/loader/pypi/tests/resources/0805nexter.json b/swh/loader/pypi/tests/resources/0805nexter.json new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/tests/resources/0805nexter.json @@ -0,0 +1 @@ 
+{"info":{"author":"hgtkpython","author_email":"2868989685@qq.com","bugtrack_url":null,"classifiers":[],"description":"UNKNOWN","description_content_type":null,"docs_url":null,"download_url":"UNKNOWN","downloads":{"last_day":-1,"last_month":-1,"last_week":-1},"home_page":"http://www.hp.com","keywords":null,"license":"UNKNOWN","maintainer":null,"maintainer_email":null,"name":"0805nexter","package_url":"https://pypi.org/project/0805nexter/","platform":"UNKNOWN","project_url":"https://pypi.org/project/0805nexter/","project_urls":{"Download":"UNKNOWN","Homepage":"http://www.hp.com"},"release_url":"https://pypi.org/project/0805nexter/1.2.0/","requires_dist":null,"requires_python":null,"summary":"a simple printer of nested lest","version":"1.2.0"},"last_serial":1931736,"releases":{"1.1.0":[{"comment_text":"","digests":{"md5":"07fc93fc12821c1405c3483db88154af","sha256":"52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035"},"downloads":-1,"filename":"0805nexter-1.1.0.zip","has_sig":false,"md5_digest":"07fc93fc12821c1405c3483db88154af","packagetype":"sdist","python_version":"source","requires_python":null,"size":862,"upload_time":"2016-01-31T05:28:42","url":"https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip"}],"1.2.0":[{"comment_text":"","digests":{"md5":"89123c78bd5d3f61cb8f46029492b18a","sha256":"49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709"},"downloads":-1,"filename":"0805nexter-1.2.0.zip","has_sig":false,"md5_digest":"89123c78bd5d3f61cb8f46029492b18a","packagetype":"sdist","python_version":"source","requires_python":null,"size":898,"upload_time":"2016-01-31T05:51:25","url":"https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip"}]},"urls":[{"comment_text":"","digests":{"md5":"89123c78bd5d3f61cb8f46029492b18a","sha256":"49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709"},"downloads":-1,"filename":"0805nexter-1.2.0.zip","has_sig":false,"md5_digest":"89123c78bd5d3f61cb8f46029492b18a","packagetype":"sdist","python_version":"source","requires_python":null,"size":898,"upload_time":"2016-01-31T05:51:25","url":"https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip"}]} \ No newline at end of file diff --git a/swh/loader/pypi/tests/test_loader.py b/swh/loader/pypi/tests/test_loader.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/tests/test_loader.py @@ -0,0 +1,258 @@ +# Copyright (C) 2016-2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import shutil +import tempfile + +from nose.plugins.attrib import attr +from nose.tools import istest +from unittest import TestCase + +from swh.model import hashutil + +from swh.loader.pypi.model import PyPiProject +from swh.loader.pypi.loader import PyPiLoader +from .common import PyPiClientWithCache, RESOURCES_PATH, LoaderNoStorage + + +class TestPyPiLoader(LoaderNoStorage, PyPiLoader): + """Real PyPiLoader for test purposes (storage and pypi interactions + inhibited) + + """ + def __init__(self, project_name, ): + project_metadata_file = '%s/%s.json' % (RESOURCES_PATH, project_name) + project_metadata_url = 'https://pypi.org/pypi/%s/json' % project_name + with open(project_metadata_file) as f: 
+ data = json.load(f) + + temp_dir = tempfile.mkdtemp( + dir='/tmp/', prefix='swh.loader.pypi.tests-') + # Will use the pypi with cache + client = PyPiClientWithCache( + temp_directory=temp_dir, cache_dir=RESOURCES_PATH) + super().__init__(client=client) + self.project = PyPiProject( + client=client, + project=project_name, + project_metadata_url=project_metadata_url, + data=data) + + def prepare(self, project_name, origin_url, + origin_metadata_url=None): + self.project_name = project_name + self.origin_url = origin_url + self.origin_metadata_url = origin_metadata_url + self.visit = 1 # first visit + + +@attr('fs') +class BaseLoaderITest(TestCase): + """Loader Test Mixin to prepare the pypi to 'load' in a test context. + + In this setup, the loader uses the cache to load data so no + network interaction (no storage, no pypi). + + """ + def setUp(self, project_name='0805nexter', + dummy_pypi_instance='https://dummy.org'): + self.tmp_root_path = tempfile.mkdtemp() + self.loader = PyPiLoaderNoSnapshot(project_name=project_name) + self._project = project_name + self._origin_url = '%s/pypi/%s/' % (dummy_pypi_instance, project_name) + self._project_metadata_url = '%s/pypi/%s/json' % ( + dummy_pypi_instance, project_name) + + def tearDown(self): + shutil.rmtree(self.tmp_root_path) + + def assertContentsOk(self, expected_contents): + contents = self.loader.all_contents + self.assertEquals(len(contents), len(expected_contents)) + + for content in contents: + content_id = hashutil.hash_to_hex(content['sha1']) + self.assertIn(content_id, expected_contents) + + def assertDirectoriesOk(self, expected_directories): + directories = self.loader.all_directories + self.assertEquals(len(directories), len(expected_directories)) + + for _dir in directories: + _dir_id = hashutil.hash_to_hex(_dir['id']) + self.assertIn(_dir_id, expected_directories) + + def assertSnapshotOk(self, expected_snapshot, expected_revisions): + snapshots = self.loader.all_snapshots + self.assertEqual(len(snapshots), 1) + + snap = snapshots[0] + snap_id = hashutil.hash_to_hex(snap['id']) + self.assertEqual(snap_id, expected_snapshot) + + branches = snap['branches'] + self.assertEqual(len(expected_revisions), len(branches)) + + for branch, target in branches.items(): + rev_id = hashutil.hash_to_hex(target['target']) + self.assertIn(rev_id, expected_revisions) + self.assertEqual('revision', target['target_type']) + + def assertRevisionsOk(self, expected_revisions): # noqa: N802 + """Check the loader's revisions match the expected revisions. + + Expects self.loader to be instantiated and ready to be + inspected (meaning the loading took place). + + Args: + expected_revisions (dict): Dict with key revision id, + value the targeted directory id. + + """ + # The last revision being the one used later to start back from + for rev in self.loader.all_revisions: + rev_id = hashutil.hash_to_hex(rev['id']) + directory_id = hashutil.hash_to_hex(rev['directory']) + + self.assertEquals(expected_revisions[rev_id], directory_id) + + +# Define loaders with no storage +# They'll just accumulate the data in place +# Only for testing purposes. 
+ + +class PyPiLoaderNoSnapshot(TestPyPiLoader): + """Same as TestPyPiLoader with no prior snapshot seen + + """ + def _last_snapshot(self): + return None + + +class LoaderITest(BaseLoaderITest): + def setUp(self, project_name='0805nexter', + dummy_pypi_instance='https://dummy.org'): + super().setUp(project_name, dummy_pypi_instance) + self.loader = PyPiLoaderNoSnapshot(project_name=project_name) + + @istest + def load(self): + """Load a pypi origin + + """ + # when + self.loader.load( + self._project, self._origin_url, self._project_metadata_url) + + # then + self.assertEquals(len(self.loader.all_contents), 6, + '3 contents per release artifact file (2 artifacts)') + self.assertEquals(len(self.loader.all_directories), 4) + self.assertEquals(len(self.loader.all_revisions), 2, + '2 releases so 2 revisions should be created') + self.assertEquals(len(self.loader.all_releases), 0, + 'No release is created in the pypi loader') + self.assertEquals(len(self.loader.all_snapshots), 1, + 'Only 1 snapshot targeting all revisions') + + expected_contents = [ + 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', + '938c33483285fd8ad57f15497f538320df82aeb8', + 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', + '405859113963cb7a797642b45f171d6360425d16', + 'e5686aa568fdb1d19d7f1329267082fe40482d31', + '83ecf6ec1114fd260ca7a833a2d165e71258c338', + ] + + self.assertContentsOk(expected_contents) + + expected_directories = [ + '05219ba38bc542d4345d5638af1ed56c7d43ca7d', + 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', + 'b178b66bd22383d5f16f4f5c923d39ca798861b4', + 'c3a58f8b57433a4b56caaa5033ae2e0931405338', + ] + self.assertDirectoriesOk(expected_directories) + + # {revision hash: directory hash} + expected_revisions = { + '4c99891f93b81450385777235a37b5e966dd1571': '05219ba38bc542d4345d5638af1ed56c7d43ca7d', # noqa + 'e445da4da22b31bfebb6ffc4383dbf839a074d21': 'b178b66bd22383d5f16f4f5c923d39ca798861b4', # noqa + } + self.assertRevisionsOk(expected_revisions) + + self.assertSnapshotOk('f456b03e8bf1920d64b00df234b1efedc25b6c93', + expected_revisions) + + +class PyPiLoaderWithSnapshot(TestPyPiLoader): + """Same as TestPyPiLoader with a prior snapshot seen + + """ + def _last_snapshot(self): + return { + 'id': b'\xf4V\xb0>\x8b\xf1\x92\rd\xb0\r\xf24\xb1\xef\xed\xc2[l\x93', + 'branches': { + b'0805nexter-1.1.0.zip': { + 'target': b'L\x99\x89\x1f\x93\xb8\x14P' + b'8Ww#Z7\xb5\xe9f\xdd\x15q', + 'target_type': 'revision' + }, + b'0805nexter-1.2.0.zip': { + 'target': b'\xe4E\xdaM\xa2+1\xbf' + b'\xeb\xb6\xff\xc48=\xbf\x83' + b'\x9a\x07M!', + 'target_type': 'revision' + }, + }, + } + + def _known_releases(self, last_snapshot): + yield from [ + ( + '0805nexter-1.1.0.zip', + '52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035' # noqa + ), + ( + '0805nexter-1.2.0.zip', + '49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709' # noqa + ) + ] + + +class LoaderWithOriginAlreadySeenITest(BaseLoaderITest): + def setUp(self, project_name='0805nexter', + dummy_pypi_instance='https://dummy.org'): + super().setUp(project_name, dummy_pypi_instance) + self.loader = PyPiLoaderWithSnapshot(project_name=project_name) + + @istest + def load(self): + """Loading an already injected pypi origin results in only 1 snapshot + + """ + # when + self.loader.load( + self._project, self._origin_url, self._project_metadata_url) + + # then + self.assertEquals(len(self.loader.all_contents), 0) + self.assertEquals(len(self.loader.all_directories), 0) + self.assertEquals(len(self.loader.all_revisions), 0) +
self.assertEquals(len(self.loader.all_releases), 0) + self.assertEquals(len(self.loader.all_snapshots), 1) + + self.assertContentsOk([]) + self.assertDirectoriesOk([]) + self.assertRevisionsOk(expected_revisions={}) + + expected_revisions = { + '4c99891f93b81450385777235a37b5e966dd1571': '05219ba38bc542d4345d5638af1ed56c7d43ca7d', # noqa + 'e445da4da22b31bfebb6ffc4383dbf839a074d21': 'b178b66bd22383d5f16f4f5c923d39ca798861b4', # noqa + } + self.assertSnapshotOk('f456b03e8bf1920d64b00df234b1efedc25b6c93', + expected_revisions) diff --git a/swh/loader/pypi/tests/test_model.py b/swh/loader/pypi/tests/test_model.py new file mode 100644 --- /dev/null +++ b/swh/loader/pypi/tests/test_model.py @@ -0,0 +1,200 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import json +import tempfile +import shutil + +from unittest import TestCase +from nose.tools import istest + +from swh.loader.pypi.model import PyPiProject, author +from swh.loader.pypi.client import _project_pkginfo + +from .common import PyPiClientWithCache, RESOURCES_PATH + + +class ModelTest(TestCase): + def setUp(self): + project = '0805nexter' + project_metadata_file = '%s/%s.json' % (RESOURCES_PATH, project) + + with open(project_metadata_file) as f: + data = json.load(f) + + temp_dir = tempfile.mkdtemp( + dir='/tmp/', prefix='swh.loader.pypi.tests-') + project_metadata_url = 'https://pypi.org/pypi/%s/json' % project + # Will use the pypi with cache + client = PyPiClientWithCache( + temp_directory=temp_dir, cache_dir=RESOURCES_PATH) + self.project = PyPiProject( + client=client, + project=project, + project_metadata_url=project_metadata_url, + data=data) + + self.data = data + self.temp_dir = temp_dir + self.project_name = project + + def tearDown(self): + if os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + + @istest + def info(self): + actual_info = self.project.info() + + expected_info = { + 'home_page': self.data['info']['home_page'], + 'description': self.data['info']['description'], + 'summary': self.data['info']['summary'], + 'license': self.data['info']['license'], + 'package_url': self.data['info']['package_url'], + 'project_url': self.data['info']['project_url'], + 'upstream': self.data['info']['project_urls']['Homepage'], + } + + self.assertEqual(expected_info, actual_info) + + @istest + def author(self): + info = self.data['info'] + actual_author = author(info) + + name = info['author'].encode('utf-8') + email = info['author_email'].encode('utf-8') + expected_author = { + 'fullname': b'%s <%s>' % (name, email), + 'name': name, + 'email': email, + } + + self.assertEqual(expected_author, actual_author) + + @istest + def releases(self): + actual_releases = self.project.releases([]) + + expected_releases = { + '1.1.0': { + 'archive_type': 'zip', + 'blake2s256': 'df9413bde66e6133b10cadefad6fcf9cbbc369b47831089112c846d79f14985a', # noqa + 'date': '2016-01-31T05:28:42', + 'filename': '0805nexter-1.1.0.zip', + 'message': '', + 'name': '1.1.0', + 'sha1': '127d8697db916ba1c67084052196a83319a25000', + 'sha1_git': '4b8f1350e6d9fa00256e974ae24c09543d85b196', + 'sha256': '52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035', # noqa + 'size': 862, + 'url': 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa + }, + 
'1.2.0': { + 'archive_type': 'zip', + 'blake2s256': '67010586b5b9a4aaa3b1c386f9dc8b4c99e6e40f37732a717a5f9b9b1185e588', # noqa + 'date': '2016-01-31T05:51:25', + 'filename': '0805nexter-1.2.0.zip', + 'message': '', + 'name': '1.2.0', + 'sha1': 'd55238554b94da7c5bf4a349ece0fe3b2b19f79c', + 'sha1_git': '8638d33a96cb25d8319af21417f00045ec6ee810', + 'sha256': '49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709', # noqa + 'size': 898, + 'url': 'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip', # noqa + } + } + + dir_paths = [] + for _release_info, _author, _release, _dir_path in actual_releases: + version = _release_info['version'] + expected_pkginfo = _project_pkginfo(_dir_path) + self.assertEquals(_release_info, expected_pkginfo) + expected_author = author(expected_pkginfo) + self.assertEqual(_author, expected_author) + expected_release = expected_releases[version] + self.assertEqual(_release, expected_release) + + self.assertTrue(version in _dir_path) + self.assertTrue(self.project_name in _dir_path) + # path still exists + self.assertTrue(os.path.exists(_dir_path)) + dir_paths.append(_dir_path) + + # Ensure uncompressed paths have been destroyed + for _dir_path in dir_paths: + # path no longer exists + self.assertFalse(os.path.exists(_dir_path)) + + +class ParseAuthorTest(TestCase): + @istest + def author_basic(self): + data = { + 'author': "i-am-groot", + 'author_email': 'iam@groot.org', + } + actual_author = author(data) + + expected_author = { + 'fullname': b'i-am-groot ', + 'name': b'i-am-groot', + 'email': b'iam@groot.org', + } + + self.assertEquals(actual_author, expected_author) + + @istest + def author_malformed(self): + data = { + 'author': "['pierre', 'paul', 'jacques']", + 'author_email': None, + } + + actual_author = author(data) + + expected_author = { + 'fullname': b"['pierre', 'paul', 'jacques']", + 'name': b"['pierre', 'paul', 'jacques']", + 'email': None, + } + + self.assertEquals(actual_author, expected_author) + + @istest + def author_malformed_2(self): + data = { + 'author': '[marie, jeanne]', + 'author_email': '[marie@some, jeanne@thing]', + } + + actual_author = author(data) + + expected_author = { + 'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>', + 'name': b'[marie, jeanne]', + 'email': b'[marie@some, jeanne@thing]', + } + + self.assertEquals(actual_author, expected_author) + + @istest + def author_malformed_3(self): + data = { + 'author': '[marie, jeanne, pierre]', + 'author_email': '[marie@somewhere.org, jeanne@somewhere.org]', + } + + actual_author = author(data) + + expected_author = { + 'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa + 'name': b'[marie, jeanne, pierre]', + 'email': b'[marie@somewhere.org, jeanne@somewhere.org]', + } + + self.assertEquals(actual_author, expected_author)
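A usage sketch for the above: how one would drive this loader by hand, mirroring LoadPyPiTask.run_task. This assumes an installed swh-loader-pypi with a loader configuration in place (storage url, temp_directory, ...); the project name below is borrowed from the test fixtures and is only an example:

    from swh.loader.pypi.loader import PyPiLoader

    project = '0805nexter'  # example project, as used in the tests
    loader = PyPiLoader()
    loader.load(project,
                'https://pypi.org/project/%s/' % project,
                origin_metadata_url='https://pypi.org/pypi/%s/json' % project)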