
diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py
index 2ee8e06..96ec9c6 100644
--- a/swh/loader/pypi/loader.py
+++ b/swh/loader/pypi/loader.py
@@ -1,183 +1,179 @@
 # Copyright (C) 2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import arrow
 import logging
 import os
 import shutil
 
 from swh.loader.core.utils import clean_dangling_folders
 from swh.loader.core.loader import SWHLoader
 from swh.model.from_disk import Directory
 from swh.model.identifiers import (
     release_identifier, revision_identifier, snapshot_identifier,
     identifier_to_bytes, normalize_timestamp
 )
 
 from .client import PyPiClient
 from .model import PyPiProject
 
 
 TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.'
 
 
 class PyPiLoader(SWHLoader):
     CONFIG_BASE_FILENAME = 'loader/pypi'
     ADDITIONAL_CONFIG = {
         'temp_directory': ('str', '/tmp/swh.loader.pypi/'),
         'cache': ('bool', False),
         'cache_dir': ('str', ''),
         'debug': ('bool', False),  # NOT FOR PRODUCTION
     }
 
     def __init__(self):
         super().__init__(logging_class='swh.loader.pypi.PyPiLoader')
         self.origin_id = None
         self.temp_directory = self.config['temp_directory']
         self.pypi_client = PyPiClient(
             temp_directory=self.temp_directory,
             cache=self.config['cache'],
             cache_dir=self.config['cache_dir'])
         self.debug = self.config['debug']
 
     def pre_cleanup(self):
         """(override) If other workers crashed mid-run (e.g. were
         OOM-killed), their temporary folders may be left behind and
         eventually fill the disk, so try to clean up such dangling
         folders first.
 
         """
         clean_dangling_folders(self.temp_directory,
                                pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
                                log=self.log)
 
     def cleanup(self):
         """(override) Clean up the temporary disk space used by this run.
 
         """
         if self.debug:
             self.log.warn('** DEBUG MODE ** Will not clean up temp dir %s' % (
                 self.temp_directory
             ))
             return
         if os.path.exists(self.temp_directory):
             self.log.debug('Clean up %s' % self.temp_directory)
             shutil.rmtree(self.temp_directory)
 
     def prepare_origin_visit(self, project_name, origin_url,
                              origin_metadata_url=None):
         """(override) Prepare the origin visit information.
 
         Args:
             project_name (str): Project's simple name
             origin_url (str): Project's main url
             origin_metadata_url (str): Project's metadata url
 
         """
         self.origin = {
             'url': origin_url,
             'type': 'pypi',
         }
         self.visit_date = None  # loader core will populate it
 
     def prepare(self, project_name, origin_url,
                 origin_metadata_url=None):
         """(override) Keep a reference to the origin url (the project) and
         to the project metadata url.
 
         Args:
             project_name (str): Project's simple name
             origin_url (str): Project's main url
             origin_metadata_url (str): Project's metadata url
 
         """
         self.project_name = project_name
         self.origin_url = origin_url
         self.origin_metadata_url = origin_metadata_url
         self.project = PyPiProject(self.pypi_client, self.project_name,
                                    self.origin_metadata_url)
 
     def fetch_data(self):
-        """(override) This will fetch and prepare the needed releases.
+        """(override) Fetch and collect swh objects.
 
         """
-        self.pypi_releases = self.project.releases()
-
-    def store_data(self):
-        """(override) This collects the necessary objects information and send
-        them to storage.
-
-        """
-        _snapshot = {
+        self._snapshot = {
             'branches': {}
         }
-        _contents = []
-        _directories = []
-        _revisions = []
-        _releases = []
+        self._contents = []
+        self._directories = []
+        self._revisions = []
+        self._releases = []
 
-        for version, _release in self.pypi_releases:
+        for version, _release in self.project.releases():
             info = self.project.info(version)
             author = self.project.author(version)
             logging.debug('author: %s' % author)
 
             release = _release['release']
             _dir_path = release.pop('directory')
             _dir_path = _dir_path.encode('utf-8')
             directory = Directory.from_disk(path=_dir_path, data=True)
             _objects = directory.collect()
 
-            _contents.extend(_objects['content'].values())
-            _directories.extend(_objects['directory'].values())
+            self._contents.extend(_objects['content'].values())
+            self._directories.extend(_objects['directory'].values())
 
             date = normalize_timestamp(
                 int(arrow.get(release['date']).timestamp))
 
             name = release['name'].encode('utf-8')
             message = release['message'].encode('utf-8')
             _revision = {
                 'synthetic': True,
                 'metadata': {
                     'original_artifact': [release],
                     'project': info,
                 },
                 'author': author,
                 'date': date,
                 'committer': author,
                 'committer_date': date,
                 'name': name,
                 'message': message,
                 'directory': directory.hash,
                 'parents': [],
                 'type': 'tar',
             }
             _revision['id'] = identifier_to_bytes(
                 revision_identifier(_revision))
-            _revisions.append(_revision)
+            self._revisions.append(_revision)
 
             _release = {
                 'name': name,
                 'author': author,
                 'date': date,
                 'message': message,
                 'target_type': 'revision',
                 'target': _revision['id'],
                 'synthetic': False,
             }
             _release['id'] = identifier_to_bytes(
                 release_identifier(_release))
-            _releases.append(_release)
+            self._releases.append(_release)
 
-            _snapshot['branches'][name] = {
+            self._snapshot['branches'][name] = {
                 'target': _release['id'],
                 'target_type': 'release',
             }
 
-        _snapshot['id'] = identifier_to_bytes(
-            snapshot_identifier(_snapshot))
+        self._snapshot['id'] = identifier_to_bytes(
+            snapshot_identifier(self._snapshot))
 
-        self.maybe_load_contents(_contents)
-        self.maybe_load_directories(_directories)
-        self.maybe_load_revisions(_revisions)
-        self.maybe_load_releases(_releases)
-        self.maybe_load_snapshot(_snapshot)
+    def store_data(self):
+        """(override) Send the collected objects to storage.
+
+        """
+        self.maybe_load_contents(self._contents)
+        self.maybe_load_directories(self._directories)
+        self.maybe_load_revisions(self._revisions)
+        self.maybe_load_releases(self._releases)
+        self.maybe_load_snapshot(self._snapshot)
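
Note on how these overrides fit together: a hedged usage sketch, assuming the 2018-era swh.loader.core.SWHLoader driver whose load() entry point invokes the hooks overridden in this diff. The project name and urls are made-up illustration values.

    from swh.loader.pypi.loader import PyPiLoader

    loader = PyPiLoader()
    # load() forwards its arguments to prepare_origin_visit() and prepare(),
    # then runs fetch_data() followed by store_data(); pre_cleanup() runs
    # before everything and cleanup() after.
    loader.load('requests',                                  # project_name
                'https://pypi.org/project/requests/',        # origin_url
                origin_metadata_url='https://pypi.org/pypi/requests/json')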

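The identifier chain used in fetch_data can also be exercised standalone. A minimal sketch, assuming the 2018-era swh.model.identifiers API imported in the diff; the author, timestamp, and directory hash below are made-up example values.

    from swh.model.identifiers import (
        revision_identifier, identifier_to_bytes, normalize_timestamp
    )

    author = {'name': b'Jane Doe', 'email': b'jane@example.com',
              'fullname': b'Jane Doe <jane@example.com>'}
    date = normalize_timestamp(1514764800)  # 2018-01-01T00:00:00Z, example value
    revision = {
        'synthetic': True,
        'author': author,
        'date': date,
        'committer': author,
        'committer_date': date,
        'message': b'synthetic revision for an example sdist',
        # hash of the empty directory, used here only as an example target
        'directory': bytes.fromhex('4b825dc642cb6eb9a060e54bf8d69288fbee4904'),
        'parents': [],
        'type': 'tar',
        'metadata': {},
    }
    # revision_identifier() returns a hex string computed over a
    # git-commit-like manifest; identifier_to_bytes() packs it into the
    # 20-byte form the storage layer expects, matching fetch_data above.
    revision['id'] = identifier_to_bytes(revision_identifier(revision))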