def parse_requirements(name=None):
    """Read a pip requirements file and return its requirement lines.

    Args:
        name: optional suffix. When given, ``requirements-<name>.txt`` is
            read, otherwise ``requirements.txt``. The path is relative to
            the current working directory.

    Returns:
        list of the stripped, non-empty, non-comment lines of the file, in
        file order. An empty list when the file does not exist.

    """
    reqf = 'requirements-%s.txt' % name if name else 'requirements.txt'

    # EAFP: open directly instead of checking for existence first, so there
    # is no window between the check and the open.
    try:
        with open(reqf, encoding='utf-8') as f:
            stripped = [line.strip() for line in f]
    except FileNotFoundError:
        return []

    return [line for line in stripped if line and not line.startswith('#')]
def convert_to_hex(d):
    """Build a copy of a flat dictionary with bytes values rendered as hex.

    Args:
        d: flat dictionary; values that are ``bytes`` are converted with
            ``hashutil.hash_to_hex``, any other value is kept untouched.

    Returns:
        A new dictionary with the same keys. A falsy input (``None`` or an
        empty dictionary) is returned as is.

    """
    if not d:
        return d

    return {
        key: hashutil.hash_to_hex(value) if isinstance(value, bytes) else value
        for key, value in d.items()
    }
class PyPiClient:
    """PyPi client in charge of discussing with the pypi server.

    Args:
        temp_directory: directory under which release archives are
            downloaded and uncompressed. When falsy, a fresh temporary
            directory is created.
        cache: when true, every json response fetched through
            :meth:`info` is also dumped in ``cache_dir``.
        cache_dir: directory receiving the cached responses; it is created
            if needed, but only when ``cache`` is true.

    """
    def __init__(self, temp_directory=None, cache=False, cache_dir=None):
        self.version = __version__
        if not temp_directory:
            from tempfile import mkdtemp
            self.temp_directory = mkdtemp(prefix='swh.loader.pypi.client')
        else:
            self.temp_directory = temp_directory

        self.do_cache = cache
        self.cache_dir = cache_dir
        if self.do_cache:
            os.makedirs(self.cache_dir, exist_ok=True)
        self.session = requests.session()
        # Extra keyword arguments sent along with every request.
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage PyPi Loader (%s)' % (
                    __version__
                )
            }
        }

    def _save_response(self, response):
        """Dump the json body of a server response in the cache directory.

        The file is gzip-compressed and named after the current UTC time
        in ISO format.

        Args:
            response: full server response

        Returns:
            nothing

        """
        import gzip
        from json import dumps
        from arrow import utcnow
        datepath = utcnow().isoformat()
        fname = os.path.join(self.cache_dir, datepath + '.gz')
        with gzip.open(fname, 'w') as f:
            f.write(bytes(dumps(response.json()), 'UTF-8'))

    def info(self, project_url):
        """Given a metadata project url, retrieve the raw json response.

        Args:
            project_url: url of the project's json metadata

        Returns:
            The decoded json body of the response.

        Raises:
            ValueError: when the server does not answer with a 200 status.

        """
        response = self.session.get(project_url, **self.params)
        if response.status_code != 200:
            # The client holds no origin url; report the url requested.
            raise ValueError('Fail to load origin %s' % project_url)

        if self.do_cache:
            self._save_response(response)

        return response.json()

    def retrieve_releases(self, project, releases):
        """Given a dictionary of releases, retrieve them locally.

        Each release archive is downloaded under
        ``<temp_directory>/<project>/<version>/``, checked against the
        announced size and sha256, then uncompressed in an ``uncompress``
        sub-directory.

        Args:
            project: project name, used as a path component
            releases: dictionary mapping a version to a release
                dictionary providing the keys 'filename', 'url', 'size'
                and 'sha256'

        Returns:
            A new dictionary with the same versions. Each release is a
            copy of the input one enriched with 'directory' (path of the
            uncompressed archive), 'archive_type', and the entries
            returned by ``hashutil.hash_path`` on the archive, with bytes
            rendered as hex. The input dictionaries are left untouched.

        Raises:
            ValueError: when a download fails, or when the size or the
                sha256 of the downloaded archive does not match.

        """
        # Copy the inner dictionaries too: they are enriched below and the
        # caller's data must not change under its feet.
        _releases = {
            version: dict(release) for version, release in releases.items()
        }
        for version, release in releases.items():
            logging.debug('version: %s', version)
            path = os.path.join(self.temp_directory, project, version)
            os.makedirs(path, exist_ok=True)
            filepath = os.path.join(path, release['filename'])
            logging.debug('filepath to write: %s', filepath)

            # Same request parameters (User-Agent) as the metadata call.
            r = self.session.get(release['url'], **self.params)
            if not r.ok:
                raise ValueError('Fail to retrieve release %s' % version)

            # The whole body is in memory at this point; check it before
            # anything gets written to disk.
            content = r.content
            _len = len(content)
            if _len != release['size']:
                raise ValueError('Error when checking size: %s != %s' % (
                    release['size'], _len))

            actual_digest = hashlib.sha256(content).hexdigest()
            if actual_digest != release['sha256']:
                raise ValueError(
                    'Error when checking the hash checksum: %s != %s' % (
                        release['sha256'], actual_digest))

            with open(filepath, 'wb') as f:
                f.write(content)

            uncompress_path = os.path.join(path, 'uncompress')
            os.makedirs(uncompress_path, exist_ok=True)

            nature = tarball.uncompress(filepath, uncompress_path)
            _releases[version]['directory'] = uncompress_path

            artifact = convert_to_hex(hashutil.hash_path(filepath))
            artifact['archive_type'] = nature
            _releases[version].update(artifact)

        return _releases
class PyPiLoader(SWHStatelessLoader):
    """Loader turning the releases of a PyPI project into synthetic
    revisions, releases and one snapshot.

    """
    CONFIG_BASE_FILENAME = 'loader/pypi'
    ADDITIONAL_CONFIG = {
        'temp_directory': ('str', '/tmp/swh.loader.pypi/'),
        'cache': ('bool', False),
        'cache_dir': ('str', ''),
        'debug': ('bool', False),  # NOT FOR PRODUCTION
    }

    def __init__(self):
        super().__init__(logging_class='swh.loader.pypi.PyPiLoader')
        self.origin_id = None
        self.temp_directory = self.config['temp_directory']
        self.pypi_client = PyPiClient(
            temp_directory=self.temp_directory,
            cache=self.config['cache'],
            cache_dir=self.config['cache_dir'])
        self.debug = self.config['debug']

    def pre_cleanup(self):
        """(override) To prevent disk explosion...

        """
        clean_dangling_folders(self.temp_directory,
                               pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
                               log=self.log)

    def cleanup(self):
        """(override) Clean up temporary disk use

        In debug mode the temporary directory is kept for inspection.

        """
        if self.debug:
            self.log.warning(
                '** DEBUG MODE ** Will not clean up temp dir %s' % (
                    self.temp_directory
                ))
            return
        if os.path.exists(self.temp_directory):
            self.log.debug('Clean up %s' % self.temp_directory)
            shutil.rmtree(self.temp_directory)

    def prepare_origin_visit(self, project_name, origin_url,
                             origin_metadata_url=None):
        """(override) Prepare the origin visit information

        """
        self.origin = {
            'url': origin_url,
            'type': 'pypi',
        }
        self.visit_date = None

    def prepare(self, project_name, origin_url,
                origin_metadata_url=None):
        """(override) Keep reference to the origin url (project) and the
           project metadata url

        """
        self.project_name = project_name
        self.origin_url = origin_url
        self.origin_metadata_url = origin_metadata_url

    def get_contents(self):
        """Contents collected by the last :meth:`fetch_data` call."""
        return self.contents

    def get_directories(self):
        """Directories collected by the last :meth:`fetch_data` call."""
        # self.directories is a plain list, not a callable.
        return self.directories

    def get_revisions(self):
        """Revisions built by the last :meth:`fetch_data` call."""
        return self.revisions

    def get_releases(self):
        """Releases built by the last :meth:`fetch_data` call."""
        return self.releases

    def get_snapshot(self):
        """Snapshot built by the last :meth:`fetch_data` call."""
        return self.snapshot

    def fetch_data(self):
        """(override) Retrieve the pypi origin's information

        Fetches the project metadata, downloads and uncompresses every
        release, then builds for each of them one synthetic revision and
        one release targeting that revision. The snapshot holds one
        branch per release. Results are stored on the instance for the
        ``get_*`` accessors.

        """
        project_info = self.pypi_client.info(self.origin_metadata_url)
        project = PyPiProject(project_info)
        releases = self.pypi_client.retrieve_releases(
            self.project_name, project.releases())
        info = project.info()
        author = project.author()

        _contents = []
        _directories = []
        _revisions = []
        _releases = []
        _snapshot = {
            'branches': {}
        }

        for version, release in releases.items():
            # The local path must not end up in the stored artifact
            # metadata, hence the pop.
            _dir_path = release.pop('directory')
            directory = Directory.from_disk(path=_dir_path.encode('utf-8'),
                                            save_path=True)
            _objects = directory.collect()

            # Flatten into a single list of objects; presumably what the
            # base loader expects from get_contents/get_directories --
            # TODO confirm against SWHStatelessLoader.
            _contents.extend(_objects['content'].values())
            _directories.extend(_objects['directory'].values())

            _revision = {
                'synthetic': True,
                'metadata': {
                    'original_artifact': [release],
                    'project': info,
                },
                'author': author,
                'date': release['date'],
                'committer': author,
                'committer_date': release['date'],
                'name': release['name'],
                'message': release['message'],
                'directory': directory.hash,
                'parents': [],
                'type': 'tar',
            }
            _revision['id'] = identifier_to_bytes(
                revision_identifier(_revision))
            _revisions.append(_revision)

            _release = {
                'name': release['name'],
                'author': author,
                'date': release['date'],
                'message': release['message'],
                'target_type': 'revision',
                'target': _revision['id'],
            }
            _release['id'] = identifier_to_bytes(
                release_identifier(_release))
            _releases.append(_release)

            _snapshot['branches'][release['name']] = {
                'target': _release['id'],
                'target_type': 'release',
            }

            logging.debug('version: %s', version)
            # 'directory' was popped from the release above; log the
            # saved path instead.
            logging.debug('release: %s', _dir_path)

        _snapshot['id'] = identifier_to_bytes(
            snapshot_identifier(_snapshot))

        self.contents = _contents
        self.directories = _directories
        self.revisions = _revisions
        self.releases = _releases
        self.snapshot = _snapshot
class PyPiProject:
    """PyPi project representation

    Thin wrapper over the decoded json metadata of a project, exposing
    the parts the loader needs.

    Args:
        data: dictionary with at least the 'info' and 'releases' keys

    """
    def __init__(self, data):
        self.data = data

    def info(self):
        """Project level metadata.

        Returns:
            Dictionary with the keys 'home_page', 'description',
            'summary', 'license', 'package_url', 'project_url' and
            'upstream'. 'upstream' is the 'Homepage' entry of
            'project_urls', or None when 'project_urls' is null or has
            no such entry.

        """
        info = self.data['info']
        # 'project_urls' may be null in the metadata.
        project_urls = info.get('project_urls') or {}
        return {
            'home_page': info['home_page'],
            'description': info['description'],
            'summary': info['summary'],
            'license': info['license'],
            'package_url': info['package_url'],
            'project_url': info['project_url'],
            'upstream': project_urls.get('Homepage'),
        }

    def author(self):
        """Author of the project.

        Returns:
            Dictionary with the keys 'fullname', 'name' (both set to the
            'author' field) and 'email'.

        """
        info = self.data['info']
        return {
            'fullname': info['author'],
            'name': info['author'],
            'email': info['author_email'],
        }

    def releases(self):
        """Releases of the project, indexed by version.

        A version listing no file at all is skipped, as there is nothing
        to retrieve for it.

        Returns:
            Dictionary mapping each version to a dictionary with the keys
            'name', 'message', 'sha256', 'size', 'filename', 'url' and
            'date'.

        Raises:
            ValueError: when a version lists more than one file.

        """
        releases = {}
        for version, release in self.data['releases'].items():
            if isinstance(release, list):
                if not release:
                    continue
                if len(release) > 1:
                    raise ValueError(  # unexpected so fail so that we
                                       # can fix later
                        'Unexpected list of more than 1 element, failing!')
                release = release[0]
            releases[version] = {
                'name': version,
                'message': release['comment_text'],
                'sha256': release['digests']['sha256'],
                'size': release['size'],
                'filename': release['filename'],
                'url': release['url'],
                'date': release['upload_time'],
            }
        return releases
class ModelTest(TestCase):
    """Checks PyPiProject against a json metadata fixture."""

    def setUp(self):
        # Locate the fixture next to this module so the tests do not
        # depend on the directory they are run from.
        import os
        fixture = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'test_model_data.json')
        with open(fixture) as f:
            self.data = json.load(f)
        self.project = PyPiProject(self.data)

    @istest
    def info(self):
        actual_info = self.project.info()

        expected_info = {
            'home_page': self.data['info']['home_page'],
            'description': self.data['info']['description'],
            'summary': self.data['info']['summary'],
            'license': self.data['info']['license'],
            'package_url': self.data['info']['package_url'],
            'project_url': self.data['info']['project_url'],
            'upstream': self.data['info']['project_urls']['Homepage'],
        }

        self.assertEqual(expected_info, actual_info)

    @istest
    def author(self):
        actual_author = self.project.author()

        expected_author = {
            'fullname': self.data['info']['author'],
            'name': self.data['info']['author'],
            'email': self.data['info']['author_email'],
        }

        self.assertEqual(expected_author, actual_author)

    @istest
    def releases(self):
        actual_releases = self.project.releases()

        release0 = self.data['releases']['0.1'][0]
        release1 = self.data['releases']['0.1.1'][0]
        self.maxDiff = None
        expected_releases = {
            '0.1': {
                'name': '0.1',
                'message': release0['comment_text'],
                'sha256': release0['digests']['sha256'],
                'size': release0['size'],
                'filename': release0['filename'],
                'url': release0['url'],
                'date': release0['upload_time'],
            },
            '0.1.1': {
                'name': '0.1.1',
                'message': release1['comment_text'],
                'sha256': release1['digests']['sha256'],
                'size': release1['size'],
                'filename': release1['filename'],
                'url': release1['url'],
                'date': release1['upload_time'],
            }
        }

        self.assertEqual(expected_releases, actual_releases)

    @istest
    def releases_unexpected_release_format(self):
        # self.project wraps self.data, which setUp reloads for every
        # test: alter it in place to give version 0.1 a second file.
        self.data['releases']['0.1'].append({'anything': 'really to break'})

        with self.assertRaisesRegex(ValueError,
                                    'Unexpected list of more than 1'):
            self.project.releases()
"https://files.pythonhosted.org/packages/68/55/6a00e46a1a10e7a0731e50cbcc9f6243c5112eeda8326d781a03a1254105/7xydothis-0.1.tar.gz" + } + ], + "0.1.1": [ + { + "comment_text": "", + "digests": { + "md5": "75fe55b933330adbde027b6edc74863d", + "sha256": "76d243b70a10d51ea87312a97a7d7b1a525984fd56d1c5f41650a1fa0fde1bc1" + }, + "downloads": -1, + "filename": "7xydothis-0.1.1.tar.gz", + "has_sig": false, + "md5_digest": "75fe55b933330adbde027b6edc74863d", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 875, + "upload_time": "2017-02-25T21:41:37", + "url": "https://files.pythonhosted.org/packages/96/64/6fd8e189aa97820b306f06dbce02d618bf155379575c553db3d2c2eda045/7xydothis-0.1.1.tar.gz" + } + ] + }, + "urls": [ + { + "comment_text": "", + "digests": { + "md5": "75fe55b933330adbde027b6edc74863d", + "sha256": "76d243b70a10d51ea87312a97a7d7b1a525984fd56d1c5f41650a1fa0fde1bc1" + }, + "downloads": -1, + "filename": "7xydothis-0.1.1.tar.gz", + "has_sig": false, + "md5_digest": "75fe55b933330adbde027b6edc74863d", + "packagetype": "sdist", + "python_version": "source", + "requires_python": null, + "size": 875, + "upload_time": "2017-02-25T21:41:37", + "url": "https://files.pythonhosted.org/packages/96/64/6fd8e189aa97820b306f06dbce02d618bf155379575c553db3d2c2eda045/7xydothis-0.1.1.tar.gz" + } + ] +}