diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py index 423c7ab..0824234 100644 --- a/swh/loader/pypi/loader.py +++ b/swh/loader/pypi/loader.py @@ -1,315 +1,332 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import arrow import hashlib import logging import os import requests import shutil from swh.core import tarball from swh.loader.core.utils import clean_dangling_folders from swh.loader.core.loader import SWHStatelessLoader from swh.model import hashutil from swh.model.from_disk import Directory -from swh.model.identifiers import (release_identifier, revision_identifier, - snapshot_identifier, identifier_to_bytes) +from swh.model.identifiers import ( + release_identifier, revision_identifier, snapshot_identifier, + identifier_to_bytes, normalize_timestamp +) from .model import PyPiProject try: from swh.loader.pypi._version import __version__ except ImportError: __version__ = 'devel' TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.' def convert_to_hex(d): """Convert a flat dictionary with bytes in values to the same dictionary with hex as values. Args: dict: flat dictionary with sha bytes in their values. Returns: Mirror dictionary with values as string hex. """ if not d: return d checksums = {} for key, h in d.items(): if isinstance(h, bytes): checksums[key] = hashutil.hash_to_hex(h) else: checksums[key] = h return checksums class PyPiClient: """PyPi client in charge of discussing with the pypi server. """ def __init__(self, temp_directory=None, cache=False, cache_dir=None): self.version = __version__ if not temp_directory: from tempfile import mkdtemp self.temp_directory = mkdtemp(dir=temp_directory, prefix='swh.loader.pypi.client') else: self.temp_directory = temp_directory self.do_cache = cache if self.do_cache: self.cache_dir = cache_dir os.makedirs(self.cache_dir, exist_ok=True) self.session = requests.session() self.params = { 'headers': { 'User-Agent': 'Software Heritage PyPi Loader (%s)' % ( __version__ ) } } def _save_response(self, response): """Log the response from a server request to a cache dir. Args: response: full server response cache_dir: system path for cache dir Returns: nothing """ import gzip from json import dumps - from arrow import utcnow - datepath = utcnow().isoformat() + datepath = arrow.utcnow().isoformat() fname = os.path.join(self.cache_dir, datepath + '.gz') with gzip.open(fname, 'w') as f: f.write(bytes( dumps(response.json()), 'UTF-8' )) def info(self, project_url): """Given a metadata project url, retrieve the raw json response """ response = self.session.get(project_url, **self.params) if response.status_code != 200: raise ValueError('Fail to load origin %s' % self.origin_url) if self.do_cache: self._save_response(response) return response.json() def retrieve_releases(self, project, releases): """Given a dictionary of releases, retrieve them locally. """ - _releases = releases.copy() - for version, release in releases.items(): - logging.debug('version: %s' % version) + # order the release in time order + _release_versions = list(releases.keys()) + _release_versions.sort() + + for version in _release_versions: + release = releases[version] + _release = release.copy() + logging.debug('Release version: %s' % version) path = os.path.join(self.temp_directory, project, version) os.makedirs(path, exist_ok=True) filepath = os.path.join(path, release['filename']) - logging.debug('filepath to write: %s' % filepath) + logging.debug('Release local path: %s' % filepath) r = self.session.get(release['url']) if not r.ok: raise ValueError('Fail to retrieve release %s' % version) # checks _len = len(r.content) if _len != release['size']: raise ValueError('Error when checking size: %s != %s' % ( release['size'], _len)) # checking digest and writing h = hashlib.sha256() with open(filepath, 'wb') as f: for chunk in r.iter_content(): h.update(chunk) f.write(chunk) actual_digest = h.hexdigest() if actual_digest != release['sha256']: raise ValueError( 'Error when checking the hash checksum: %s != %s' % ( release['sha256'], actual_digest)) uncompress_path = os.path.join(path, 'uncompress') os.makedirs(uncompress_path, exist_ok=True) nature = tarball.uncompress(filepath, uncompress_path) - _releases[version]['directory'] = uncompress_path + _release['directory'] = uncompress_path artifact = convert_to_hex(hashutil.hash_path(filepath)) artifact['archive_type'] = nature for key, value in artifact.items(): - _releases[version][key] = value + _release[key] = value - return _releases + yield version, _release class PyPiLoader(SWHStatelessLoader): CONFIG_BASE_FILENAME = 'loader/pypi' ADDITIONAL_CONFIG = { 'temp_directory': ('str', '/tmp/swh.loader.pypi/'), 'cache': ('bool', False), 'cache_dir': ('str', ''), 'debug': ('bool', False), # NOT FOR PRODUCTION } def __init__(self): super().__init__(logging_class='swh.loader.pypi.PyPiLoader') self.origin_id = None self.temp_directory = self.config['temp_directory'] self.pypi_client = PyPiClient( temp_directory=self.temp_directory, cache=self.config['cache'], cache_dir=self.config['cache_dir']) self.debug = self.config['debug'] def pre_cleanup(self): """(override) To prevent disk explosion... """ clean_dangling_folders(self.temp_directory, pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, log=self.log) def cleanup(self): """(override) Clean up temporary disk use """ if self.debug: self.log.warn('** DEBUG MODE ** Will not clean up temp dir %s' % ( self.temp_directory )) return if os.path.exists(self.temp_directory): self.log.debug('Clean up %s' % self.temp_directory) shutil.rmtree(self.temp_directory) def prepare_origin_visit(self, project_name, origin_url, origin_metadata_url=None): """(override) Prepare the origin visit information """ self.origin = { 'url': origin_url, 'type': 'pypi', } self.visit_date = None def prepare(self, project_name, origin_url, origin_metadata_url=None): """(override) Keep reference to the origin url (project) and the project metadata url """ self.project_name = project_name self.origin_url = origin_url self.origin_metadata_url = origin_metadata_url def get_contents(self): - return self.contents + return self._contents def get_directories(self): - return self.directories() + return self._directories def get_revisions(self): - return self.revisions + return self._revisions def get_releases(self): - return self.releases + return self._releases def get_snapshot(self): - return self.snapshot + return self._snapshot def fetch_data(self): - """(override) Retrieve the pypi origin's information + """(override) Compute pypi data: + + - 1. Retrieve project information + - 2. Fetch the releases and uncompress them + - 3. Collection object information (contents, directories, + revisions, releases, snapshot) """ project_info = self.pypi_client.info(self.origin_metadata_url) project = PyPiProject(project_info) releases = self.pypi_client.retrieve_releases( self.project_name, project.releases()) info = project.info() author = project.author() _contents = [] _directories = [] _revisions = [] _releases = [] _snapshot = { 'branches': {} } - # for each - for version, release in releases.items(): + _last_rev = None + + for version, release in releases: _dir_path = release.pop('directory') - directory = Directory.from_disk(path=_dir_path.encode('utf-8'), - save_path=True) + _dir_path = _dir_path.encode('utf-8') + directory = Directory.from_disk(path=_dir_path, data=True) _objects = directory.collect() - _contents.append(_objects['content'].values()) - _directories.append(_objects['directory'].values()) + _contents.extend(_objects['content'].values()) + _directories.extend(_objects['directory'].values()) + date = normalize_timestamp( + int(arrow.get(release['date']).timestamp)) + + name = release['name'].encode('utf-8') + message = release['message'].encode('utf-8') _revision = { 'synthetic': True, 'metadata': { 'original_artifact': [release], 'project': info, }, 'author': author, - 'date': release['date'], + 'date': date, 'committer': author, - 'committer_date': release['date'], - 'name': release['name'], - 'message': release['message'], + 'committer_date': date, + 'name': name, + 'message': message, 'directory': directory.hash, - 'parents': [], + 'parents': [] if _last_rev is None else [_last_rev['id']], 'type': 'tar', } _revision['id'] = identifier_to_bytes( revision_identifier(_revision)) _revisions.append(_revision) + _last_rev = _revision _release = { - 'name': release['name'], + 'name': name, 'author': author, - 'date': release['date'], - 'message': release['message'], + 'date': date, + 'message': message, 'target_type': 'revision', 'target': _revision['id'], + 'synthetic': False, } _release['id'] = identifier_to_bytes( release_identifier(_release)) _releases.append(_release) - _snapshot['branches'][release['name']] = { + _snapshot['branches'][name] = { 'target': _release['id'], 'target_type': 'release', } - logging.debug('version: %s' % version) - logging.debug('release: %s' % release['directory']) - _snapshot['id'] = identifier_to_bytes( snapshot_identifier(_snapshot)) - self.contents = _contents - self.directories = _directories - self.revisions = _revisions - self.releases = _releases - self.snapshot = _snapshot + self._contents = _contents + self._directories = _directories + self._revisions = _revisions + self._releases = _releases + self._snapshot = _snapshot diff --git a/swh/loader/pypi/model.py b/swh/loader/pypi/model.py index 365255c..89213cd 100644 --- a/swh/loader/pypi/model.py +++ b/swh/loader/pypi/model.py @@ -1,50 +1,58 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import logging + + class PyPiProject: """PyPi project representation """ def __init__(self, data): self.data = data def info(self): return { 'home_page': self.data['info']['home_page'], 'description': self.data['info']['description'], 'summary': self.data['info']['summary'], 'license': self.data['info']['license'], 'package_url': self.data['info']['package_url'], 'project_url': self.data['info']['project_url'], 'upstream': self.data['info']['project_urls']['Homepage'], } def author(self): + name = self.data['info']['author'].encode('utf-8') + email = self.data['info']['author_email'].encode('utf-8') return { - 'fullname': self.data['info']['author'], - 'name': self.data['info']['author'], - 'email': self.data['info']['author_email'] + 'fullname': name, + 'name': name, + 'email': email, } def releases(self): releases = {} for version, release in self.data['releases'].items(): + logging.debug('version: %s, release: %s' % (version, release)) + # FIXME: there can be multiple 'package_type' here: + # sdist, bdist_egg, bdist_wheel if isinstance(release, list): if len(release) > 1: raise ValueError( # unexpected so fail so that we # can fix later 'Unexpected list of more than 1 element, failing!') release = release[0] releases[version] = { 'name': version, 'message': release['comment_text'], 'sha256': release['digests']['sha256'], 'size': release['size'], 'filename': release['filename'], 'url': release['url'], 'date': release['upload_time'], } return releases diff --git a/swh/loader/pypi/tests/test_model.py b/swh/loader/pypi/tests/test_model.py index 6e2444e..4f143a0 100644 --- a/swh/loader/pypi/tests/test_model.py +++ b/swh/loader/pypi/tests/test_model.py @@ -1,86 +1,88 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from unittest import TestCase from nose.tools import istest from swh.loader.pypi.model import PyPiProject class ModelTest(TestCase): def setUp(self): with open('./swh/loader/pypi/tests/test_model_data.json') as f: self.data = json.load(f) self.project = PyPiProject(self.data) @istest def info(self): actual_info = self.project.info() expected_info = { 'home_page': self.data['info']['home_page'], 'description': self.data['info']['description'], 'summary': self.data['info']['summary'], 'license': self.data['info']['license'], 'package_url': self.data['info']['package_url'], 'project_url': self.data['info']['project_url'], 'upstream': self.data['info']['project_urls']['Homepage'], } self.assertEqual(expected_info, actual_info) @istest def author(self): actual_author = self.project.author() + name = self.data['info']['author'].encode('utf-8') + email = self.data['info']['author_email'].encode('utf-8') expected_author = { - 'fullname': self.data['info']['author'], - 'name': self.data['info']['author'], - 'email': self.data['info']['author_email'], + 'fullname': name, + 'name': name, + 'email': email, } self.assertEqual(expected_author, actual_author) @istest def releases(self): actual_releases = self.project.releases() release0 = self.data['releases']['0.1'][0] release1 = self.data['releases']['0.1.1'][0] self.maxDiff = None expected_releases = { '0.1': { 'name': '0.1', 'message': release0['comment_text'], 'sha256': release0['digests']['sha256'], 'size': release0['size'], 'filename': release0['filename'], 'url': release0['url'], 'date': release0['upload_time'], }, '0.1.1': { 'name': '0.1.1', 'message': release1['comment_text'], 'sha256': release1['digests']['sha256'], 'size': release1['size'], 'filename': release1['filename'], 'url': release1['url'], 'date': release1['upload_time'], } } self.assertEqual(expected_releases, actual_releases) @istest def releases_unexpected_release_format(self): data = self.data.copy() data['releases']['0.1'].append({'anything': 'really to break'}) with self.assertRaisesRegex(ValueError, 'Unexpected list of more than 1'): self.project.releases()