# Copyright (C) 2018  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging


def info(data):
    """Given a dict of data, return a project subset.

    Args:
        data (dict): project metadata holding an 'info' dict with the
            'home_page', 'description', 'summary', 'license',
            'package_url' and 'project_url' keys (and optionally
            'project_urls').

    Returns:
        dict: the keys listed above plus 'upstream', which is the
        'Homepage' entry of 'project_urls' when there is one, None
        otherwise.

    Raises:
        KeyError: if 'info' or one of the mandatory keys is missing.

    """
    # Named `project_info` so the local does not shadow this function.
    project_info = data['info']
    default = {
        'home_page': project_info['home_page'],
        'description': project_info['description'],
        'summary': project_info['summary'],
        'license': project_info['license'],
        'package_url': project_info['package_url'],
        'project_url': project_info['project_url'],
        'upstream': None,
    }
    project_urls = project_info.get('project_urls')
    if project_urls:
        homepage = project_urls.get('Homepage')
        if homepage:
            default['upstream'] = homepage
    return default


def author(data):
    """Given a dict of data, return an author subset.

    Args:
        data (dict): project metadata holding an 'info' dict with the
            'author' and 'author_email' keys.

    Returns:
        dict: 'fullname', 'name' and 'email', all utf-8 encoded bytes.
        'fullname' is 'name <email>' when an email is set, the bare name
        otherwise.

    Raises:
        KeyError: if 'info', 'author' or 'author_email' is missing.

    """
    # Either field may be None; normalise to '' so that encoding below
    # cannot fail and the name is never rendered as the string 'None'.
    name = data['info']['author'] or ''
    email = data['info']['author_email'] or ''
    if email:
        fullname = '%s <%s>' % (name, email)
    else:
        fullname = name

    return {
        'fullname': fullname.encode('utf-8'),
        'name': name.encode('utf-8'),
        'email': email.encode('utf-8'),
    }


class PyPiProject:
    """PyPi project representation

    This permits to extract information for the:
    - project, either the latest information (from the last revision)
    - either the information for a given release
    - Symmetrically for the release author information

    This also fetches the associated release artifacts through the
    client.

    """
    def __init__(self, client, project, project_metadata_url, data=None):
        """Initialise the project representation.

        Args:
            client: object providing `info(url)`, `release(project, name)`
                and `fetch_release_artifact(project, release)`.
            project (str): project name.
            project_metadata_url (str): url passed to `client.info` when
                `data` is not provided.
            data (dict): already fetched project metadata; when falsy,
                it is fetched through `client.info`.

        """
        self.client = client
        self.project = project
        self.project_metadata_url = project_metadata_url
        if data:
            self.data = data
        else:
            self.data = client.info(project_metadata_url)

        self.last_version = self.data['info']['version']
        # The project-level metadata doubles as the latest release's one.
        self.cache = {
            self.last_version: self.data
        }

    def _data(self, release_name=None):
        """Fetch data per release and cache it.

        Args:
            release_name (str): release to fetch the data for; when
                falsy, the project-level data is returned.

        Returns:
            dict: the cached data if already fetched, otherwise the
            result of `client.release` (which is then cached).

        """
        if not release_name:
            return self.data

        data = self.cache.get(release_name)
        if not data:
            data = self.client.release(self.project, release_name)
            self.cache[release_name] = data
        return data

    def info(self, release_name=None):
        """Compute release information for release provided or the latest
           one.

        """
        return info(self._data(release_name))

    def author(self, release_name=None):
        """Compute author for the provided release if provided (use the
           latest release otherwise).

        """
        return author(self._data(release_name))

    def _parse_release_artifact(self, version, release):
        """Heuristically determine the release artifact to use as a release
           file.

        Choose amongst package type 'sdist' (source) file.
        Others are not considered (yet?)

        Args:
            version (str): release version, only used in log messages.
            release (list or dict): artifact descriptions for the
                version, each with 'packagetype' and 'filename' keys. A
                value which is not a list is returned untouched.

        Returns:
            dict: the first artifact whose 'packagetype' is 'sdist', or
            None when the list holds no such artifact.

        """
        # FIXME: there can be multiple 'package_type' here:
        # sdist, bdist_egg, bdist_wheel, bdist_rpm, bdist_msi, bdist_wininst
        if not isinstance(release, list):
            # Already a single artifact description: nothing to choose
            # from, hand it back as is.
            return release

        if len(release) > 1:
            logging.warning(
                '%s %s: Multiple release artifacts (%s)',
                self.project, version,
                [(rel['packagetype'], rel['filename']) for rel in release])

        # Filter on 'sdist' package type (source code)
        sdist = [rel for rel in release if rel['packagetype'] == 'sdist']

        if not sdist:
            logging.warning(
                '%s %s: No source artifact found, skipping',
                self.project, version)
            return None

        if len(sdist) > 1:
            logging.warning(
                '%s %s: Multiple sdist files detected (%s)!',
                self.project, version,
                ','.join([rel['filename'] for rel in sdist]))

        # FIXME: take the first one?
        # Index the filtered list, not the raw one: the first raw entry
        # may well be a binary artifact.
        return sdist[0]

    def releases(self):
        """Fetch metadata and data per release.

        Versions with no artifact at all, or with no source ('sdist')
        artifact, are skipped.

        Yields:
            tuple (version, release), release being a dict with the
            'info' of the release and, under 'release', the result of
            `client.fetch_release_artifact`.

        """
        # The compute information per release
        releases_dict = self.data['releases']
        for version, artifacts in releases_dict.items():
            # Decide whether the version is usable before asking for its
            # information, so that skipped versions cost no extra query.
            if not artifacts:
                continue

            release = self._parse_release_artifact(version, artifacts)
            if release is None:
                continue

            if version == self.last_version:  # avoid an extra query
                release_info = self.info()
            else:
                release_info = self.info(release_name=version)

            # flatten the metadata to ease reading
            _flattenned_release = {
                'name': version,
                'message': release['comment_text'],
                'sha256': release['digests']['sha256'],
                'size': release['size'],
                'filename': release['filename'],
                'url': release['url'],
                'date': release['upload_time'],
            }
            # fetch and write locally archives
            _release = self.client.fetch_release_artifact(
                self.project, _flattenned_release)

            yield version, {
                'info': release_info,
                'release': _release,
            }