# Review notes for this patch (leading text ahead of the first "diff --git" header).
#
# Purpose: the patch touches two files.
#   * swh/loader/pypi/loader.py (hunk header -1,176 +1,172):
#     PyPiLoader.store_data() drops the `_last_rev` bookkeeping. Each synthetic
#     revision is now built with 'parents': [] rather than pointing at the
#     revision built in the previous loop iteration.
#   * swh/loader/pypi/model.py (hunk header -1,145 +1,130):
#     PyPiProject._sort_releases() is removed and PyPiProject.releases() now
#     iterates self.data['releases'] directly, i.e. in the mapping's own
#     iteration order rather than in the previously sorted version order.
#
# State of this copy: the patch's line breaks have been collapsed, so
# removed/added markers ("-", "+") appear inline and the hunk line counts
# cannot be checked against the text as it stands.
#
# Observations on the code carried by the patch:
#   * model.py still has `import logging` as a context line, while the only use
#     of `logging` visible in that file's hunk (logging.exception in
#     _sort_releases) is removed.
#     NOTE(review): possibly an unused import after this change - confirm.
#   * author() calls .encode('utf-8') on both `name` and `email` unconditionally,
#     although the preceding `if email:` branch anticipates a falsy email.
#     NOTE(review): presumably the metadata can carry None for these fields, in
#     which case .encode would raise - verify against real project payloads.
#   * store_data() logs through the module-level `logging.debug`, whereas
#     cleanup() logs through `self.log`.
#   * releases() skips a version whose release list is empty and raises
#     ValueError when the list holds more than one entry.
#   * PyPiProject.fetch_release() is a stub whose body is `pass`.
diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py index e6a3545..7055584 100644 --- a/swh/loader/pypi/loader.py +++ b/swh/loader/pypi/loader.py @@ -1,176 +1,172 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import arrow import logging import os import shutil from swh.loader.core.utils import clean_dangling_folders from swh.loader.core.loader import SWHLoader from swh.model.from_disk import Directory from swh.model.identifiers import ( release_identifier, revision_identifier, snapshot_identifier, identifier_to_bytes, normalize_timestamp ) from .client import PyPiClient from .model import PyPiProject TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.' class PyPiLoader(SWHLoader): CONFIG_BASE_FILENAME = 'loader/pypi' ADDITIONAL_CONFIG = { 'temp_directory': ('str', '/tmp/swh.loader.pypi/'), 'cache': ('bool', False), 'cache_dir': ('str', ''), 'debug': ('bool', False), # NOT FOR PRODUCTION } def __init__(self): super().__init__(logging_class='swh.loader.pypi.PyPiLoader') self.origin_id = None self.temp_directory = self.config['temp_directory'] self.pypi_client = PyPiClient( temp_directory=self.temp_directory, cache=self.config['cache'], cache_dir=self.config['cache_dir']) self.debug = self.config['debug'] def pre_cleanup(self): """(override) To prevent disk explosion... 
""" clean_dangling_folders(self.temp_directory, pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, log=self.log) def cleanup(self): """(override) Clean up temporary disk use """ if self.debug: self.log.warn('** DEBUG MODE ** Will not clean up temp dir %s' % ( self.temp_directory )) return if os.path.exists(self.temp_directory): self.log.debug('Clean up %s' % self.temp_directory) shutil.rmtree(self.temp_directory) def prepare_origin_visit(self, project_name, origin_url, origin_metadata_url=None): """(override) Prepare the origin visit information """ self.origin = { 'url': origin_url, 'type': 'pypi', } self.visit_date = None def prepare(self, project_name, origin_url, origin_metadata_url=None): """(override) Keep reference to the origin url (project) and the project metadata url """ self.project_name = project_name self.origin_url = origin_url self.origin_metadata_url = origin_metadata_url self.project = PyPiProject(self.pypi_client, self.project_name, self.origin_metadata_url) def fetch_data(self): """(override) This will fetch and prepare the needed releases. """ self.pypi_releases = self.project.releases() def store_data(self): """(override) This collects the necessary objects information and send them to storage. 
""" _snapshot = { 'branches': {} } - _last_rev = None - _contents = [] _directories = [] _revisions = [] _releases = [] for version, _release in self.pypi_releases: info = self.project.info(version) author = self.project.author(version) logging.debug('author: %s' % author) release = _release['release'] _dir_path = release.pop('directory') _dir_path = _dir_path.encode('utf-8') directory = Directory.from_disk(path=_dir_path, data=True) _objects = directory.collect() _contents.extend(_objects['content'].values()) _directories.extend(_objects['directory'].values()) date = normalize_timestamp( int(arrow.get(release['date']).timestamp)) name = release['name'].encode('utf-8') message = release['message'].encode('utf-8') _revision = { 'synthetic': True, 'metadata': { 'original_artifact': [release], 'project': info, }, 'author': author, 'date': date, 'committer': author, 'committer_date': date, 'name': name, 'message': message, 'directory': directory.hash, - 'parents': [] if _last_rev is None else [_last_rev['id']], + 'parents': [], 'type': 'tar', } _revision['id'] = identifier_to_bytes( revision_identifier(_revision)) _revisions.append(_revision) - _last_rev = _revision - _release = { 'name': name, 'author': author, 'date': date, 'message': message, 'target_type': 'revision', 'target': _revision['id'], 'synthetic': False, } _release['id'] = identifier_to_bytes( release_identifier(_release)) _releases.append(_release) _snapshot['branches'][name] = { 'target': _release['id'], 'target_type': 'release', } _snapshot['id'] = identifier_to_bytes( snapshot_identifier(_snapshot)) self.maybe_load_contents(_contents) self.maybe_load_directories(_directories) self.maybe_load_revisions(_revisions) self.maybe_load_releases(_releases) self.maybe_load_snapshot(_snapshot) diff --git a/swh/loader/pypi/model.py b/swh/loader/pypi/model.py index d5cff92..0b853fb 100644 --- a/swh/loader/pypi/model.py +++ b/swh/loader/pypi/model.py @@ -1,145 +1,130 @@ # Copyright (C) 2018 The Software Heritage 
developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging def info(data): """Given a dict of data, returns a project subset. """ info = data['info'] default = { 'home_page': info['home_page'], 'description': info['description'], 'summary': info['summary'], 'license': info['license'], 'package_url': info['package_url'], 'project_url': info['project_url'], 'upstream': None, } project_urls = info.get('project_urls') if project_urls: homepage = project_urls.get('Homepage') if homepage: default['upstream'] = homepage return default def author(data): """Given a dict of data, returns an author subset. """ name = data['info']['author'] email = data['info']['author_email'] if email: fullname = '%s <%s>' % (name, email) else: fullname = name return { 'fullname': fullname.encode('utf-8'), 'name': name.encode('utf-8'), 'email': email.encode('utf-8'), } class PyPiProject: """PyPi project representation This permits to extract information for the: - project, either the latest information (from the last revision) - project information for a given release - same for author information """ def __init__(self, client, project, project_metadata_url, data=None): self.client = client self.project = project self.project_metadata_url = project_metadata_url if data: self.data = data else: self.data = client.info(project_metadata_url) self.last_version = self.data['info']['version'] self.cache = { self.last_version: self.data } def _data(self, release_name=None): if release_name: data = self.cache.get(release_name) if not data: data = self.client.release(self.project, release_name) self.cache[release_name] = data else: data = self.data return data def info(self, release_name=None): return info(self._data(release_name)) def author(self, release_name=None): return author(self._data(release_name)) def fetch_release(self, 
release_name=None): pass - def _sort_releases(self, releases_dict): - # sort releases in ascending order - _releases = list(releases_dict.keys()) - __releases = [] - try: - for _release in _releases: - __releases.append([int(i) for i in _release.split('.')]) - __releases.sort() - for rel in __releases: - yield '.'.join((str(i) for i in rel)) - except Exception: - logging.exception('Sort release simply using string order...') - _releases.sort() - return _releases - def releases(self): # The compute information per release releases_dict = self.data['releases'] - for version in self._sort_releases(releases_dict): + for version in releases_dict: release = releases_dict[version] if version == self.last_version: # avoid an extra query release_info = self.info() else: release_info = self.info(release_name=version) # FIXME: there can be multiple 'package_type' here: # sdist, bdist_egg, bdist_wheel if isinstance(release, list): if not release: continue if len(release) > 1: raise ValueError( 'Unsupported other formats for now, failing!') release = release[0] # flatten the metadata to ease reading _flattenned_release = { 'name': version, 'message': release['comment_text'], 'sha256': release['digests']['sha256'], 'size': release['size'], 'filename': release['filename'], 'url': release['url'], 'date': release['upload_time'], } # fetch and write locally archives _release = self.client.fetch_release( self.project, _flattenned_release) yield version, { 'info': release_info, 'release': _release, }