diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py index 2ee8e06..96ec9c6 100644 --- a/swh/loader/pypi/loader.py +++ b/swh/loader/pypi/loader.py @@ -1,183 +1,179 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import arrow import logging import os import shutil from swh.loader.core.utils import clean_dangling_folders from swh.loader.core.loader import SWHLoader from swh.model.from_disk import Directory from swh.model.identifiers import ( release_identifier, revision_identifier, snapshot_identifier, identifier_to_bytes, normalize_timestamp ) from .client import PyPiClient from .model import PyPiProject TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.' class PyPiLoader(SWHLoader): CONFIG_BASE_FILENAME = 'loader/pypi' ADDITIONAL_CONFIG = { 'temp_directory': ('str', '/tmp/swh.loader.pypi/'), 'cache': ('bool', False), 'cache_dir': ('str', ''), 'debug': ('bool', False), # NOT FOR PRODUCTION } def __init__(self): super().__init__(logging_class='swh.loader.pypi.PyPiLoader') self.origin_id = None self.temp_directory = self.config['temp_directory'] self.pypi_client = PyPiClient( temp_directory=self.temp_directory, cache=self.config['cache'], cache_dir=self.config['cache_dir']) self.debug = self.config['debug'] def pre_cleanup(self): """(override) To prevent disk explosion if some other workers exploded in mid-air (OOM killed), we try and clean up dangling files. """ clean_dangling_folders(self.temp_directory, pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, log=self.log) def cleanup(self): """(override) Clean up temporary disk use """ if self.debug: self.log.warn('** DEBUG MODE ** Will not clean up temp dir %s' % ( self.temp_directory )) return if os.path.exists(self.temp_directory): self.log.debug('Clean up %s' % self.temp_directory) shutil.rmtree(self.temp_directory) def prepare_origin_visit(self, project_name, origin_url, origin_metadata_url=None): """(override) Prepare the origin visit information Args: project_name (str): Project's simple name origin_url (str): Project's main url origin_metadata_url (str): Project's metadata url """ self.origin = { 'url': origin_url, 'type': 'pypi', } self.visit_date = None # loader core will populate it def prepare(self, project_name, origin_url, origin_metadata_url=None): """(override) Keep reference to the origin url (project) and the project metadata url Args: project_name (str): Project's simple name origin_url (str): Project's main url origin_metadata_url (str): Project's metadata url """ self.project_name = project_name self.origin_url = origin_url self.origin_metadata_url = origin_metadata_url self.project = PyPiProject(self.pypi_client, self.project_name, self.origin_metadata_url) def fetch_data(self): - """(override) This will fetch and prepare the needed releases. + """(override) Fetch and collect swh objects. """ - self.pypi_releases = self.project.releases() - - def store_data(self): - """(override) This collects the necessary objects information and send - them to storage. - - """ - _snapshot = { + self._snapshot = { 'branches': {} } + self._contents = [] + self._directories = [] + self._revisions = [] + self._releases = [] - _contents = [] - _directories = [] - _revisions = [] - _releases = [] - - for version, _release in self.pypi_releases: + for version, _release in self.project.releases(): info = self.project.info(version) author = self.project.author(version) logging.debug('author: %s' % author) release = _release['release'] _dir_path = release.pop('directory') _dir_path = _dir_path.encode('utf-8') directory = Directory.from_disk(path=_dir_path, data=True) _objects = directory.collect() - _contents.extend(_objects['content'].values()) - _directories.extend(_objects['directory'].values()) + self._contents.extend(_objects['content'].values()) + self._directories.extend(_objects['directory'].values()) date = normalize_timestamp( int(arrow.get(release['date']).timestamp)) name = release['name'].encode('utf-8') message = release['message'].encode('utf-8') _revision = { 'synthetic': True, 'metadata': { 'original_artifact': [release], 'project': info, }, 'author': author, 'date': date, 'committer': author, 'committer_date': date, 'name': name, 'message': message, 'directory': directory.hash, 'parents': [], 'type': 'tar', } _revision['id'] = identifier_to_bytes( revision_identifier(_revision)) - _revisions.append(_revision) + self._revisions.append(_revision) _release = { 'name': name, 'author': author, 'date': date, 'message': message, 'target_type': 'revision', 'target': _revision['id'], 'synthetic': False, } _release['id'] = identifier_to_bytes( release_identifier(_release)) - _releases.append(_release) + self._releases.append(_release) - _snapshot['branches'][name] = { + self._snapshot['branches'][name] = { 'target': _release['id'], 'target_type': 'release', } - _snapshot['id'] = identifier_to_bytes( - snapshot_identifier(_snapshot)) + self._snapshot['id'] = identifier_to_bytes( + snapshot_identifier(self._snapshot)) + + def store_data(self): + """(override) This sends collected objects to storage. - self.maybe_load_contents(_contents) - self.maybe_load_directories(_directories) - self.maybe_load_revisions(_revisions) - self.maybe_load_releases(_releases) - self.maybe_load_snapshot(_snapshot) + """ + self.maybe_load_contents(self._contents) + self.maybe_load_directories(self._directories) + self.maybe_load_revisions(self._revisions) + self.maybe_load_releases(self._releases) + self.maybe_load_snapshot(self._snapshot)