# Copyright (C) 2018  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import arrow
import hashlib
import logging
import os
import requests

from swh.core import tarball
from swh.model import hashutil

try:
    from swh.loader.pypi._version import __version__
except ImportError:
    __version__ = 'devel'


def convert_to_hex(d):
    """Convert a flat dictionary with bytes in values to the same
    dictionary with hex as values.

    Args:
        d (dict): flat dictionary with sha bytes in their values.

    Returns:
        Mirror dictionary with values as string hex.

    """
    if not d:
        return d
    # Non-bytes values (e.g. lengths) pass through unchanged.
    return {key: hashutil.hash_to_hex(h) if isinstance(h, bytes) else h
            for key, h in d.items()}


class PyPiClient:
    """PyPi client in charge of discussing with the pypi server.

    Args:
        temp_directory (str): working directory for downloads; a fresh
            temporary directory is created when not provided.
        cache (bool): when True, every raw json response is archived to
            ``cache_dir``.
        cache_dir (str): directory receiving the gzipped response cache
            (must be provided when ``cache`` is True).

    """
    def __init__(self, temp_directory=None, cache=False, cache_dir=None):
        self.version = __version__
        if not temp_directory:
            # Lazy import: tempfile only needed when we create the dir
            from tempfile import mkdtemp
            self.temp_directory = mkdtemp(dir=temp_directory,
                                          prefix='swh.loader.pypi.client')
        else:
            self.temp_directory = temp_directory

        self.do_cache = cache
        if self.do_cache:
            self.cache_dir = cache_dir
            os.makedirs(self.cache_dir, exist_ok=True)

        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage PyPi Loader (%s)' % (
                    __version__
                )
            }
        }

    def _save_response(self, response):
        """Archive a server response as a timestamped gzipped json file
        under ``self.cache_dir``.

        Args:
            response: full server response

        Returns:
            nothing

        """
        import gzip
        from json import dumps
        datepath = arrow.utcnow().isoformat()
        fname = os.path.join(self.cache_dir, datepath + '.gz')
        with gzip.open(fname, 'w') as f:
            f.write(bytes(
                dumps(response.json()),
                'utf-8'
            ))

    def _get(self, url):
        """Get query to the url, returning the decoded json payload.

        Raises:
            ValueError: when the server does not answer with status 200.

        """
        response = self.session.get(url, **self.params)
        if response.status_code != 200:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))

        if self.do_cache:
            self._save_response(response)

        return response.json()

    def info(self, project_url):
        """Given a metadata project url, retrieve the raw json response

        """
        return self._get(project_url)

    def release(self, project, release):
        """Given a project and a release name, retrieve the raw json response

        """
        release_url = 'https://pypi.org/pypi/%s/%s/json' % (project, release)
        return self._get(release_url)

    def fetch_release(self, project, release):
        """Download one release artifact, verify its advertised size and
        sha256, then uncompress it locally.

        Args:
            project (str): project name
            release (dict): release metadata holding at least the keys
                'name', 'filename', 'url', 'size' and 'sha256'.

        Returns:
            The same ``release`` dict, updated in place with the local
            'directory', the computed checksums (as hex strings) and the
            'archive_type'.

        Raises:
            ValueError: when the download fails or when the size/sha256
                checks do not match the advertised metadata.

        """
        version = release['name']
        logging.debug('Release version: %s' % version)
        path = os.path.join(self.temp_directory, project, version)
        os.makedirs(path, exist_ok=True)
        filepath = os.path.join(path, release['filename'])
        logging.debug('Release local path: %s' % filepath)

        url = release['url']
        r = self.session.get(url, **self.params)
        if r.status_code != 200:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, r.status_code))

        # checks
        _len = len(r.content)
        if _len != release['size']:
            raise ValueError('Error when checking size: %s != %s' % (
                release['size'], _len))

        # checking digest and writing
        h = hashlib.sha256()
        with open(filepath, 'wb') as f:
            # chunk_size: without it iter_content yields 1 byte per
            # chunk (requests default), pathological for large tarballs
            for chunk in r.iter_content(chunk_size=65536):
                h.update(chunk)
                f.write(chunk)

        actual_digest = h.hexdigest()
        if actual_digest != release['sha256']:
            raise ValueError(
                'Error when checking the hash checksum: %s != %s' % (
                    release['sha256'], actual_digest))

        uncompress_path = os.path.join(path, 'uncompress')
        os.makedirs(uncompress_path, exist_ok=True)

        nature = tarball.uncompress(filepath, uncompress_path)
        release['directory'] = uncompress_path

        artifact = convert_to_hex(hashutil.hash_path(filepath))
        artifact['archive_type'] = nature
        for key, value in artifact.items():
            release[key] = value

        return release