# Copyright (C) 2018  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import arrow
import hashlib
import logging
import os
import requests

from pkginfo import UnpackedSDist
from shutil import copyfile

from swh.core import tarball
from swh.model import hashutil

try:
    from swh.loader.pypi._version import __version__
except ImportError:
    __version__ = 'devel'


def convert_to_hex(d):
    """Convert a flat dictionary with bytes in values to the same
       dictionary with hex as values.

    Args:
        d (dict): flat dictionary, possibly with sha bytes in its values.

    Returns:
        Mirror dictionary where every bytes value has been converted with
        hashutil.hash_to_hex; other values are kept untouched. A falsy
        input (None, empty dict) is returned as is.

    """
    if not d:
        return d

    return {
        key: hashutil.hash_to_hex(h) if isinstance(h, bytes) else h
        for key, h in d.items()
    }


def _to_dict(pkginfo):
    """Given a pkginfo object, convert it to a dict.

    Iterating over the object yields the attribute names, each of which
    is then read with getattr.

    """
    return {k: getattr(pkginfo, k) for k in pkginfo}


def _project_pkginfo(dir_path):
    """Given an uncompressed path holding the pkginfo path, returns a
       pkginfo as dict.

    Args:
        dir_path (str): Directory in which the archive got uncompressed.
            Its first entry is assumed to be the project directory.

    Returns:
        The PKG-INFO content as dict, or None when dir_path is empty or
        when the project directory holds no PKG-INFO file.

    """
    entries = os.listdir(dir_path)
    if not entries:
        # nothing got uncompressed, so there is no metadata to read
        return None

    pkginfo_path = os.path.join(dir_path, entries[0], 'PKG-INFO')
    if not os.path.exists(pkginfo_path):
        return None
    return _to_dict(UnpackedSDist(pkginfo_path))


class PyPiClient:
    """PyPi client in charge of discussing with the pypi server.

    Args:
        temp_directory (str): Working directory in which release artifacts
            are fetched and uncompressed.
        cache (bool): When true, server responses and fetched archives are
            also saved below cache_dir, and archives are read back from
            there when present.
        cache_dir (str): Root of the cache; only used when cache is true.

    """
    def __init__(self, temp_directory=None, cache=False, cache_dir=None):
        self.version = __version__
        self.temp_directory = temp_directory

        self.do_cache = cache
        # always define the cache attributes so they exist whether or not
        # caching is enabled
        self.cache_dir = cache_dir
        self.cache_raw_dir = None
        if self.do_cache:
            self.cache_raw_dir = os.path.join(cache_dir, 'archives')
            os.makedirs(self.cache_raw_dir, exist_ok=True)
        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage PyPi Loader (%s)' % (
                    __version__
                )
            }
        }

    def _save_response(self, response):
        """Log the response from a server request to a cache dir.

        The json body is written gzip-compressed into self.cache_dir, in
        a file named after the current UTC timestamp.

        Args:
            response (Response): full server response

        Returns:
            nothing

        """
        import gzip
        from json import dumps
        datepath = arrow.utcnow().isoformat()
        fname = os.path.join(self.cache_dir, datepath + '.gz')
        with gzip.open(fname, 'w') as f:
            f.write(bytes(
                dumps(response.json()),
                'utf-8'
            ))

    def _save_raw(self, filepath):
        """In cache mode, backup the filepath to self.cache_raw_dir

        Args:
            filepath (str): Path of the file to save

        """
        _filename = os.path.basename(filepath)
        _archive = os.path.join(self.cache_raw_dir, _filename)
        copyfile(filepath, _archive)

    def _get_raw(self, filepath):
        """In cache mode, we try to retrieve the cached file.

        Args:
            filepath (str): Destination path; its basename is the name
                looked up in self.cache_raw_dir.

        Returns:
            filepath once the cached archive has been copied there, None
            when no archive of that name is cached.

        """
        _filename = os.path.basename(filepath)
        _archive = os.path.join(self.cache_raw_dir, _filename)
        if not os.path.exists(_archive):
            return None
        copyfile(_archive, filepath)
        return filepath

    def _get(self, url):
        """Get query to the url.

        Args:
            url (str): Url

        Raises:
            ValueError in case of failing to query

        Returns:
            Response as dict if ok

        """
        response = self.session.get(url, **self.params)
        if response.status_code != 200:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))

        if self.do_cache:
            self._save_response(response)

        return response.json()

    def info(self, project_url):
        """Given a metadata project url, retrieve the raw json response

        Args:
            project_url (str): Project's pypi to retrieve information

        Returns:
            Main project information as dict.

        """
        return self._get(project_url)

    def release(self, project, release):
        """Given a project and a release name, retrieve the raw information
           for said project's release.

        Args:
            project (str): Project's name
            release (str): Release name (version), interpolated as is in
                the queried url

        Returns:
            Release information as dict

        """
        release_url = 'https://pypi.org/pypi/%s/%s/json' % (project, release)
        return self._get(release_url)

    def fetch_release_artifact(self, project, release):
        """Fetch for a given release project the associated artifact.

        This:
        - fetches the artifact (from the cache when enabled and present)
        - checks the size, hashes match (only for a fresh download)
        - uncompress the artifact locally
        - computes the swh hashes
        - returns the associated information for the artifact

        Args:
            project (str): Project's name
            release (dict): Release information; the keys 'name',
                'filename', 'url', 'size' and 'sha256' are read. The dict
                is updated in place with 'filepath', the computed hashes
                and 'archive_type'.

        Raises:
            ValueError when the download fails or when the downloaded
            content does not match the expected size or sha256.

        Returns:
            tuple (release, archive_path, uncompress_archive_path, pkginfo)
            where:
                release is the release information (dict)
                archive_path is the fetch archive
                uncompressed_archive_path is the uncompressed archive
                pkginfo is the PKG-INFO content as dict, or None if the
                  archive holds none

        """
        version = release['name']
        logging.debug('Release version: %s', version)
        path = os.path.join(self.temp_directory, project, version)
        os.makedirs(path, exist_ok=True)
        filepath = os.path.join(path, release['filename'])
        logging.debug('Release local path: %s', filepath)

        _filepath = None
        if self.do_cache:
            _filepath = self._get_raw(filepath)

        if not _filepath:
            # no cache hit, we fetch from pypi
            url = release['url']
            r = self.session.get(url, **self.params)
            if r.status_code != 200:
                raise ValueError("Fail to query '%s'. Reason: %s" % (
                    url, r.status_code))

            # The body is already fully loaded in memory, so hash and
            # write it in one go rather than iterating it chunk by chunk
            # (iter_content() defaults to 1-byte chunks).
            content = r.content
            _len = len(content)
            if _len != release['size']:
                raise ValueError('Error when checking size: %s != %s' % (
                    release['size'], _len))

            # checking digest before writing, so that no corrupted
            # archive is left on disk
            actual_digest = hashlib.sha256(content).hexdigest()
            if actual_digest != release['sha256']:
                raise ValueError(
                    'Error when checking the hash checksum: %s != %s' % (
                        release['sha256'], actual_digest))

            with open(filepath, 'wb') as f:
                f.write(content)

            if self.do_cache:
                self._save_raw(filepath)

        release['filepath'] = filepath

        uncompress_path = os.path.join(path, 'uncompress')
        os.makedirs(uncompress_path, exist_ok=True)
        nature = tarball.uncompress(filepath, uncompress_path)

        artifact = convert_to_hex(hashutil.hash_path(filepath))
        artifact['archive_type'] = nature
        release.update(artifact)

        pkginfo = _project_pkginfo(uncompress_path)
        return release, filepath, uncompress_path, pkginfo