diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py
index ba97c47..e6a3545 100644
--- a/swh/loader/pypi/loader.py
+++ b/swh/loader/pypi/loader.py
@@ -1,194 +1,176 @@
 # Copyright (C) 2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import arrow
 import logging
 import os
 import shutil
 
 from swh.loader.core.utils import clean_dangling_folders
-from swh.loader.core.loader import SWHStatelessLoader
+from swh.loader.core.loader import SWHLoader
 from swh.model.from_disk import Directory
 from swh.model.identifiers import (
     release_identifier, revision_identifier, snapshot_identifier,
     identifier_to_bytes, normalize_timestamp
 )
 
 from .client import PyPiClient
 from .model import PyPiProject
 
 TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.'
 
 
-class PyPiLoader(SWHStatelessLoader):
+class PyPiLoader(SWHLoader):
     CONFIG_BASE_FILENAME = 'loader/pypi'
     ADDITIONAL_CONFIG = {
         'temp_directory': ('str', '/tmp/swh.loader.pypi/'),
         'cache': ('bool', False),
         'cache_dir': ('str', ''),
         'debug': ('bool', False),  # NOT FOR PRODUCTION
     }
 
     def __init__(self):
         super().__init__(logging_class='swh.loader.pypi.PyPiLoader')
         self.origin_id = None
         self.temp_directory = self.config['temp_directory']
         self.pypi_client = PyPiClient(
             temp_directory=self.temp_directory,
             cache=self.config['cache'],
             cache_dir=self.config['cache_dir'])
         self.debug = self.config['debug']
 
     def pre_cleanup(self):
         """(override) To prevent disk explosion...
 
         """
         clean_dangling_folders(self.temp_directory,
                                pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
                                log=self.log)
 
     def cleanup(self):
         """(override) Clean up temporary disk use
 
         """
         if self.debug:
             self.log.warn('** DEBUG MODE ** Will not clean up temp dir %s' % (
                 self.temp_directory
             ))
             return
         if os.path.exists(self.temp_directory):
             self.log.debug('Clean up %s' % self.temp_directory)
             shutil.rmtree(self.temp_directory)
 
     def prepare_origin_visit(self, project_name, origin_url,
                              origin_metadata_url=None):
         """(override) Prepare the origin visit information
 
         """
         self.origin = {
             'url': origin_url,
             'type': 'pypi',
         }
         self.visit_date = None
 
     def prepare(self, project_name, origin_url,
                 origin_metadata_url=None):
         """(override) Keep reference to the origin url (project) and the
            project metadata url
 
         """
         self.project_name = project_name
         self.origin_url = origin_url
         self.origin_metadata_url = origin_metadata_url
         self.project = PyPiProject(self.pypi_client, self.project_name,
                                    self.origin_metadata_url)
 
-    def get_contents(self):
-        return self._contents
-
-    def get_directories(self):
-        return self._directories
-
-    def get_revisions(self):
-        return self._revisions
-
-    def get_releases(self):
-        return self._releases
-
-    def get_snapshot(self):
-        return self._snapshot
-
     def fetch_data(self):
-        """(override) Compute pypi data:
-
-            1. Retrieve project information
-
-            2. Fetch the releases and uncompress them
-
-            3. Collection object information (contents, directories,
-               revisions, releases, snapshot)
+        """(override) This will fetch and prepare the needed releases.
 
         """
-        pypi_releases = self.project.releases()
+        self.pypi_releases = self.project.releases()
 
-        _contents = []
-        _directories = []
-        _revisions = []
-        _releases = []
+    def store_data(self):
+        """(override) This collects the necessary objects information and send
+        them to storage.
+
+        """
         _snapshot = {
             'branches': {}
         }
         _last_rev = None
 
-        for version, _release in pypi_releases:
+        _contents = []
+        _directories = []
+        _revisions = []
+        _releases = []
+
+        for version, _release in self.pypi_releases:
             info = self.project.info(version)
             author = self.project.author(version)
             logging.debug('author: %s' % author)
             release = _release['release']
             _dir_path = release.pop('directory')
             _dir_path = _dir_path.encode('utf-8')
             directory = Directory.from_disk(path=_dir_path, data=True)
             _objects = directory.collect()
 
             _contents.extend(_objects['content'].values())
             _directories.extend(_objects['directory'].values())
 
             date = normalize_timestamp(
                 int(arrow.get(release['date']).timestamp))
 
             name = release['name'].encode('utf-8')
             message = release['message'].encode('utf-8')
             _revision = {
                 'synthetic': True,
                 'metadata': {
                     'original_artifact': [release],
                     'project': info,
                 },
                 'author': author,
                 'date': date,
                 'committer': author,
                 'committer_date': date,
                 'name': name,
                 'message': message,
                 'directory': directory.hash,
                 'parents': [] if _last_rev is None else [_last_rev['id']],
                 'type': 'tar',
             }
             _revision['id'] = identifier_to_bytes(
                 revision_identifier(_revision))
             _revisions.append(_revision)
+            _last_rev = _revision
 
             _release = {
                 'name': name,
                 'author': author,
                 'date': date,
                 'message': message,
                 'target_type': 'revision',
                 'target': _revision['id'],
                 'synthetic': False,
             }
             _release['id'] = identifier_to_bytes(
                 release_identifier(_release))
             _releases.append(_release)
 
             _snapshot['branches'][name] = {
                 'target': _release['id'],
                 'target_type': 'release',
             }
 
         _snapshot['id'] = identifier_to_bytes(
             snapshot_identifier(_snapshot))
 
-        logging.debug('contents: %s' % len(_contents))
-        logging.debug('directories: %s' % len(_directories))
-        logging.debug('revisions: %s' % len(_revisions))
-        logging.debug('releases: %s' % len(_releases))
-
-        self._contents = _contents
-        self._directories = _directories
-        self._revisions = _revisions
-        self._releases = _releases
-        self._snapshot = _snapshot
+        self.maybe_load_contents(_contents)
+        self.maybe_load_directories(_directories)
+        self.maybe_load_revisions(_revisions)
+        self.maybe_load_releases(_releases)
+        self.maybe_load_snapshot(_snapshot)