# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import os
import shutil


def info(data):
    """Given a dict of project data, return the project metadata subset.

    Args:
        data (dict): Project information as returned by the pypi api
            (must contain an 'info' key)

    Returns:
        dict: Subset with home_page, description, summary, license,
        package_url, project_url and upstream (the 'Homepage' project
        url when present, None otherwise)

    """
    _info = data['info']
    default = {
        'home_page': _info['home_page'],
        'description': _info['description'],
        'summary': _info['summary'],
        'license': _info['license'],
        'package_url': _info['package_url'],
        'project_url': _info['project_url'],
        'upstream': None,
    }

    project_urls = _info.get('project_urls')
    if project_urls:
        homepage = project_urls.get('Homepage')
        if homepage:
            default['upstream'] = homepage

    return default


def author(data):
    """Given a dict of data, return an author subset.

    Args:
        data (dict): Dict with 'author' and 'author_email' keys

    Returns:
        dict: {'fullname': bytes, 'name': bytes or None,
        'email': bytes or None}

    """
    name = data['author']
    email = data['author_email']
    if email:
        fullname = '%s <%s>' % (name, email)
    else:
        fullname = name

    if not fullname:
        return {'fullname': b'', 'name': None, 'email': None}

    # Encode the non-empty values to utf-8 bytes; falsy name/email are
    # passed through unencoded (i.e. left as None or '').
    if fullname:
        fullname = fullname.encode('utf-8')

    if name:
        name = name.encode('utf-8')

    if email:
        email = email.encode('utf-8')

    return {'fullname': fullname, 'name': name, 'email': email}


class PyPiProject:
    """PyPi project representation.

    This permits to extract information for the:

    - project, either the latest information (from the last revision),
      or the information for a given release

    - release author information (symmetrically)

    This also fetches and uncompresses the associated release
    artifacts.

    """
    def __init__(self, client, project, project_metadata_url, data=None):
        self.client = client
        self.project = project
        self.project_metadata_url = project_metadata_url
        if data:
            self.data = data
        else:
            self.data = client.info(project_metadata_url, project)

        self.last_version = self.data['info']['version']
        # Per-release metadata cache, keyed by release name/version
        self.cache = {
            self.last_version: self.data
        }

    def _data(self, release_name=None):
        """Fetch data per release and cache it.

        Returns the cached data if already fetched; without a release
        name, returns the project's latest data.

        """
        if release_name:
            data = self.cache.get(release_name)
            if not data:
                data = self.client.release(self.project, release_name)
                self.cache[release_name] = data
        else:
            data = self.data
        return data

    def info(self, release_name=None):
        """Compute release information for provided release (or latest one).

        """
        return info(self._data(release_name))

    def author(self, release_name=None):
        """Compute author information for provided release (or latest one).

        """
        data = self._data(release_name)
        return author(data['info'])

    def _filter_releases(self, version, release):
        """Filter releases to keep only sdist (source distribution) ones.

        There can be multiple 'packagetype' values (sdist, bdist_egg,
        bdist_wheel, bdist_rpm, bdist_msi, bdist_wininst, ...); we are
        only interested in source distributions (sdist), the other
        bdist* types being binary.

        Args:
            version (str): Release name or version
            release (dict or [dict]): Full release object(s)

        Returns:
            [dict]: The releases whose packagetype is 'sdist'

        """
        if not release:
            return []
        if not isinstance(release, list):
            release = [release]
        # Filter only on 'sdist' package type
        return [rel for rel in release if rel['packagetype'] == 'sdist']

    def _cleanup_release_artifacts(self, archive_path, directory_path):
        """Clean up intermediary files which no longer need to be present.

        Args:
            archive_path (str): Path to the fetched archive
            directory_path (str): Path to the uncompressed archive

        """
        if directory_path and os.path.exists(directory_path):
            logging.debug('Clean up uncompressed archive path %s' % (
                directory_path, ))
            shutil.rmtree(directory_path)

        if archive_path and os.path.exists(archive_path):
            logging.debug('Clean up archive %s' % archive_path)
            os.unlink(archive_path)

    def _fetch_and_uncompress_releases(self, version, releases):
        """Fetch and uncompress sdist releases.

        Args:
            version (str): Release name or version
            releases ([dict]): List of source distribution release
                artifacts

        Yields:
            tuple (release, filepath, uncompressed_path)

        """
        for release in releases:
            # flatten the metadata to ease reading
            _flattened_release = {
                'name': version,
                'message': release.get('comment_text', ''),
                'sha256': release['digests']['sha256'],
                'size': release['size'],
                'filename': release['filename'],
                'url': release['url'],
                'date': release['upload_time'],
            }

            # fetch and write archives locally
            yield self.client.fetch_release_artifact(
                self.project, _flattened_release)

    def releases(self):
        """Fetch metadata and data per release.

        This:

        - downloads and uncompresses the release artifacts
        - yields one tuple per release
        - cleans up the intermediary fetched artifact files

        Yields:
            tuple (release_info, author, release, uncompressed_path)
            where:

            - release_info (dict): release's associated version info
            - author (dict): author information for the release
            - release (dict): release metadata
            - uncompressed_path (str): Path to uncompressed artifact

        """
        # Compute information per release
        releases_dict = self.data['releases']
        for version, releases in releases_dict.items():
            releases = self._filter_releases(version, releases)
            if not releases:
                logging.warning('%s %s: No source artifact found, skipping' % (
                    self.project, version))
                continue

            _releases = self._fetch_and_uncompress_releases(version, releases)
            for _release, _archive, _dir_path, _pkginfo in _releases:
                _release_info = _pkginfo
                if _release_info is None:  # fallback to pypi api metadata
                    msg = 'No PKG-INFO detected for %s, %s, skipping' % (
                        self.project, _release['name'])
                    logging.warning(msg)
                    _release_info = self.info(release_name=version)
                    _author = self.author(release_name=version)
                else:
                    _author = author(_release_info)

                yield _release_info, _author, _release, _dir_path
                self._cleanup_release_artifacts(_archive, _dir_path)