diff --git a/swh/loader/package/pypi.py b/swh/loader/package/pypi.py
index a34d2f4..f92bff9 100644
--- a/swh/loader/package/pypi.py
+++ b/swh/loader/package/pypi.py
@@ -1,251 +1,191 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 
 from typing import Generator, Dict, Tuple, Sequence
 from urllib.parse import urlparse
 
 from pkginfo import UnpackedSDist
 
 import iso8601
 import requests
 
 from swh.model.identifiers import normalize_timestamp
-from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.loader.package import DEFAULT_PARAMS
 from swh.loader.package.loader import PackageLoader
-
-try:
-    from swh.loader.core._version import __version__
-except ImportError:
-    __version__ = 'devel'
-
-
-DEFAULT_PARAMS = {
-    'headers': {
-        'User-Agent': 'Software Heritage Loader (%s)' % (
-            __version__
-        )
-    }
-}
+from swh.loader.package.utils import download
 
 
 def pypi_api_url(url: str) -> str:
     """Compute api url from a project url
 
     Args:
         url (str): PyPI instance's url (e.g: https://pypi.org/project/requests)
         This deals with correctly transforming the project's api url (e.g
         https://pypi.org/pypi/requests/json)
 
     Returns:
         api url
 
     """
     p_url = urlparse(url)
     project_name = p_url.path.split('/')[-1]
     url = '%s://%s/pypi/%s/json' % (p_url.scheme, p_url.netloc, project_name)
     return url
 
 
 def pypi_info(url: str) -> Dict:
     """PyPI api client to retrieve information on project. This deals with
        fetching json metadata about pypi projects.
 
     Args:
         url (str): PyPI instance's url (e.g: https://pypi.org/project/requests)
         This deals with correctly transforming the project's api url (e.g
         https://pypi.org/pypi/requests/json)
 
     Raises:
         ValueError in case of query failures (for some reasons: 404, ...)
 
     Returns:
         PyPI's information dict
 
     """
     api_url = pypi_api_url(url)
     response = requests.get(api_url, **DEFAULT_PARAMS)
     if response.status_code != 200:
         raise ValueError("Fail to query '%s'. Reason: %s" % (
             api_url, response.status_code))
     return response.json()
 
 
-def download(url: str, dest: str) -> Tuple[str, Dict]:
-    """Download a remote tarball from url, uncompresses and computes swh hashes
-       on it.
-
-    Args:
-        url: Artifact uri to fetch, uncompress and hash
-        dest: Directory to write the archive to
-
-    Raises:
-        ValueError in case of any error when fetching/computing
-
-    Returns:
-        Tuple of local (filepath, hashes of filepath)
-
-    """
-    response = requests.get(url, **DEFAULT_PARAMS, stream=True)
-    if response.status_code != 200:
-        raise ValueError("Fail to query '%s'. Reason: %s" % (
Reason: %s" % ( - url, response.status_code)) - length = int(response.headers['content-length']) - - filepath = os.path.join(dest, os.path.basename(url)) - - h = MultiHash(length=length) - with open(filepath, 'wb') as f: - for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE): - h.update(chunk) - f.write(chunk) - - actual_length = os.path.getsize(filepath) - if length != actual_length: - raise ValueError('Error when checking size: %s != %s' % ( - length, actual_length)) - - # hashes = h.hexdigest() - # actual_digest = hashes['sha256'] - # if actual_digest != artifact['sha256']: - # raise ValueError( - # '%s %s: Checksum mismatched: %s != %s' % ( - # project, version, artifact['sha256'], actual_digest)) - - return filepath, { - 'length': length, - **h.hexdigest() - } - - def sdist_parse(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from pypi. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return None lst = os.listdir(dir_path) if len(lst) == 0: return None project_dirname = lst[0] pkginfo_path = os.path.join(dir_path, project_dirname, 'PKG-INFO') if not os.path.exists(pkginfo_path): return None pkginfo = UnpackedSDist(pkginfo_path) raw = pkginfo.__dict__ raw.pop('filename') # this gets added with the ondisk location return raw def author(data: Dict) -> Dict: """Given a dict of project/release artifact information (coming from PyPI), returns an author subset. Args: data (dict): Representing either artifact information or release information. Returns: swh-model dict representing a person. """ name = data.get('author') email = data.get('author_email') if email: fullname = '%s <%s>' % (name, email) else: fullname = name if not fullname: return {'fullname': b'', 'name': None, 'email': None} fullname = fullname.encode('utf-8') if name is not None: name = name.encode('utf-8') if email is not None: email = email.encode('utf-8') return {'fullname': fullname, 'name': name, 'email': email} class PyPILoader(PackageLoader): """Load pypi origin's artifact releases into swh archive. 
""" visit_type = 'pypi' def __init__(self, url): super().__init__(url=url) self._info = None @property def info(self) -> Dict: """Return the project metadata information (fetched from pypi registry) """ if not self._info: self._info = pypi_info(self.url) return self._info def get_versions(self) -> Sequence[str]: return self.info['releases'].keys() def get_default_release(self) -> str: return self.info['info']['version'] def get_artifacts(self, version: str) -> Generator[ Tuple[str, str, Dict], None, None]: for meta in self.info['releases'][version]: yield meta['filename'], meta['url'], meta def fetch_artifact_archive( self, artifact_uri: str, dest: str) -> Tuple[str, Dict]: return download(artifact_uri, dest=dest) def build_revision( self, a_metadata: Dict, a_uncompressed_path: str) -> Dict: # Parse metadata (project, artifact metadata) metadata = sdist_parse(a_uncompressed_path) # from intrinsic metadata name = metadata['version'] _author = author(metadata) # from extrinsic metadata message = a_metadata.get('comment_text', '') message = '%s: %s' % (name, message) if message else name date = normalize_timestamp( int(iso8601.parse_date(a_metadata['upload_time']).timestamp())) return { 'message': message.encode('utf-8'), 'author': _author, 'date': date, 'committer': _author, 'committer_date': date, 'parents': [], 'metadata': { 'intrinsic_metadata': metadata, } } diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py index 6b20fa7..7dd1c67 100644 --- a/swh/loader/package/tests/test_pypi.py +++ b/swh/loader/package/tests/test_pypi.py @@ -1,433 +1,371 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import re from os import path from urllib.parse import urlparse import pytest from swh.core.tarball import uncompress from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.loader.package.pypi import ( - PyPILoader, pypi_api_url, pypi_info, author, sdist_parse, download + PyPILoader, pypi_api_url, pypi_info, author, sdist_parse ) DATADIR = path.join(path.abspath(path.dirname(__file__)), 'resources') def test_author_basic(): data = { 'author': "i-am-groot", 'author_email': 'iam@groot.org', } actual_author = author(data) expected_author = { 'fullname': b'i-am-groot ', 'name': b'i-am-groot', 'email': b'iam@groot.org', } assert actual_author == expected_author def test_author_empty_email(): data = { 'author': 'i-am-groot', 'author_email': '', } actual_author = author(data) expected_author = { 'fullname': b'i-am-groot', 'name': b'i-am-groot', 'email': b'', } assert actual_author == expected_author def test_author_empty_name(): data = { 'author': "", 'author_email': 'iam@groot.org', } actual_author = author(data) expected_author = { 'fullname': b' ', 'name': b'', 'email': b'iam@groot.org', } assert actual_author == expected_author def test_author_malformed(): data = { 'author': "['pierre', 'paul', 'jacques']", 'author_email': None, } actual_author = author(data) expected_author = { 'fullname': b"['pierre', 'paul', 'jacques']", 'name': b"['pierre', 'paul', 'jacques']", 'email': None, } assert actual_author == expected_author def test_author_malformed_2(): data = { 'author': '[marie, jeanne]', 'author_email': '[marie@some, jeanne@thing]', } actual_author = author(data) expected_author = { 'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>', 
         'name': b'[marie, jeanne]',
         'email': b'[marie@some, jeanne@thing]',
     }
 
     assert actual_author == expected_author
 
 
 def test_author_malformed_3():
     data = {
         'author': '[marie, jeanne, pierre]',
         'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
     }
 
     actual_author = author(data)
 
     expected_author = {
         'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>',  # noqa
         'name': b'[marie, jeanne, pierre]',
         'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
     }
 
     assert actual_author == expected_author
 
 
 # configuration error #
 
 def test_badly_configured_loader_raise(monkeypatch):
     """Badly configured loader should raise"""
     monkeypatch.delenv('SWH_CONFIG_FILENAME')
     with pytest.raises(ValueError) as e:
         PyPILoader(url='some-url')
 
     assert 'Misconfiguration' in e.value.args[0]
 
 
 def test_pypi_api_url():
     """Compute pypi api url from the pypi project url should be ok"""
     url = pypi_api_url('https://pypi.org/project/requests')
     assert url == 'https://pypi.org/pypi/requests/json'
 
 
 def test_pypi_info_failure(requests_mock):
     """Failure to fetch info/release information should raise"""
     project_url = 'https://pypi.org/project/requests'
     info_url = 'https://pypi.org/pypi/requests/json'
     status_code = 400
     requests_mock.get(info_url, status_code=status_code)
 
     with pytest.raises(ValueError) as e0:
         pypi_info(project_url)
 
     assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (
         info_url, status_code
     )
 
 
 def test_pypi_info(requests_mock):
     """Fetching json info from pypi project should be ok"""
     url = 'https://pypi.org/project/requests'
     info_url = 'https://pypi.org/pypi/requests/json'
     requests_mock.get(info_url, text='{"version": "0.0.1"}')
 
     actual_info = pypi_info(url)
 
     assert actual_info == {
         'version': '0.0.1',
     }
 
 
 @pytest.mark.fs
 def test_sdist_parse(tmp_path):
     """Parsing existing archive's PKG-INFO should yield results"""
     uncompressed_archive_path = str(tmp_path)
     archive_path = path.join(
         DATADIR, 'files.pythonhosted.org', '0805nexter-1.1.0.zip')
     uncompress(archive_path, dest=uncompressed_archive_path)
 
     actual_sdist = sdist_parse(uncompressed_archive_path)
 
     expected_sdist = {
         'metadata_version': '1.0',
         'name': '0805nexter',
         'version': '1.1.0',
         'summary': 'a simple printer of nested lest',
         'home_page': 'http://www.hp.com',
         'author': 'hgtkpython',
         'author_email': '2868989685@qq.com',
         'platforms': ['UNKNOWN'],
     }
 
     assert actual_sdist == expected_sdist
 
 
 @pytest.mark.fs
 def test_sdist_parse_failures(tmp_path):
     """Parsing a nonexistent path/archive/PKG-INFO yields None"""
     # nonexistent first level path
     assert sdist_parse('/something-inexistant') is None
     # nonexistent second level path (as expected by pypi archives)
     assert sdist_parse(tmp_path) is None
     # nonexistent PKG-INFO within second level path
     existing_path_no_pkginfo = str(tmp_path / 'something')
     os.mkdir(existing_path_no_pkginfo)
     assert sdist_parse(tmp_path) is None
 
 
-@pytest.mark.fs
-def test_download_fail_to_download(tmp_path, requests_mock):
-    url = 'https://pypi.org/pypi/arrow/json'
-    status_code = 404
-    requests_mock.get(url, status_code=status_code)
-
-    with pytest.raises(ValueError) as e:
-        download(url, tmp_path)
-
-    assert e.value.args[0] == "Fail to query '%s'. Reason: %s" % (
Reason: %s" % ( - url, status_code) - - -@pytest.mark.fs -def test_download_fail_length_mismatch(tmp_path, requests_mock): - """Mismatch length after download should raise - - """ - filename = 'requests-0.0.1.tar.gz' - url = 'https://pypi.org/pypi/requests/%s' % filename - data = 'this is something' - wrong_size = len(data) - 3 - requests_mock.get(url, text=data, headers={ - 'content-length': str(wrong_size) # wrong size! - }) - - with pytest.raises(ValueError) as e: - download(url, dest=str(tmp_path)) - - assert e.value.args[0] == "Error when checking size: %s != %s" % ( - wrong_size, len(data) - ) - - -@pytest.mark.fs -def test_download_ok(tmp_path, requests_mock): - """Download without issue should provide filename and hashes""" - filename = 'requests-0.0.1.tar.gz' - url = 'https://pypi.org/pypi/requests/%s' % filename - data = 'this is something' - requests_mock.get(url, text=data, headers={ - 'content-length': str(len(data)) - }) - - actual_filepath, actual_hashes = download(url, dest=str(tmp_path)) - - actual_filename = os.path.basename(actual_filepath) - assert actual_filename == filename - assert actual_hashes['length'] == len(data) - assert actual_hashes['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2' - assert (actual_hashes['sha256'] == - '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5') - - -@pytest.mark.fs -def test_download_fail_hashes_mismatch(tmp_path, requests_mock): - """Mismatch hash after download should raise - - """ - pass - - # LOADER SCENARIO # def get_response_cb(request, context): """""" url = urlparse(request.url) dirname = url.hostname # pypi.org | files.pythonhosted.org # url.path: pypi//json -> local file: pypi__json filename = url.path[1:].replace('/', '_') filepath = path.join(DATADIR, dirname, filename) fd = open(filepath, 'rb') context.headers['content-length'] = str(os.path.getsize(filepath)) return fd # "edge" cases (for the same origin) # def test_no_release_artifact(requests_mock): pass # no release artifact: # {visit full, status: uneventful, no contents, etc...} # problem during loading: # {visit: partial, status: uneventful, no snapshot} # problem during loading: failure early enough in between swh contents... # some contents (contents, directories, etc...) have been written in storage # {visit: partial, status: eventful, no snapshot} # problem during loading: failure late enough we can have snapshots (some # revisions are written in storage already) # {visit: partial, status: eventful, snapshot} # "normal" cases (for the same origin) # def test_release_artifact_no_prior_visit(requests_mock): """With no prior visit, load a pypi project ends up with 1 snapshot """ assert 'SWH_CONFIG_FILENAME' in os.environ # cf. 
     loader = PyPILoader('https://pypi.org/project/0805nexter')
     requests_mock.get(re.compile('https://'), body=get_response_cb)
 
     actual_load_status = loader.load()
 
     assert actual_load_status == {'status': 'eventful'}
 
     stats = loader.storage.stat_counters()
     assert {
         'content': 6,
         'directory': 4,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 2,
         'skipped_content': 0,
         'snapshot': 1
     } == stats
 
     expected_contents = map(hash_to_bytes, [
         'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
         '938c33483285fd8ad57f15497f538320df82aeb8',
         'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
         '405859113963cb7a797642b45f171d6360425d16',
         'e5686aa568fdb1d19d7f1329267082fe40482d31',
         '83ecf6ec1114fd260ca7a833a2d165e71258c338',
     ])
 
     assert list(loader.storage.content_missing_per_sha1(expected_contents))\
         == []
 
     expected_dirs = map(hash_to_bytes, [
         '05219ba38bc542d4345d5638af1ed56c7d43ca7d',
         'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
         'b178b66bd22383d5f16f4f5c923d39ca798861b4',
         'c3a58f8b57433a4b56caaa5033ae2e0931405338',
     ])
 
     assert list(loader.storage.directory_missing(expected_dirs)) == []
 
     # {revision hash: directory hash}
     expected_revs = {
         hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'),  # noqa
         hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'),  # noqa
     }
 
     assert list(loader.storage.revision_missing(expected_revs)) == []
 
     expected_branches = {
         'releases/1.1.0': {
             'target': '4c99891f93b81450385777235a37b5e966dd1571',
             'target_type': 'revision',
         },
         'releases/1.2.0': {
             'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
             'target_type': 'revision',
         },
         'HEAD': {
             'target': 'releases/1.2.0',
             'target_type': 'alias',
         },
     }
 
     check_snapshot(
         'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a',
         expected_branches,
         storage=loader.storage)
 
     # self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
     # self.assertEqual(self.loader.visit_status(), 'full')
 
 
 # release artifact, no new artifact
 # {visit full, status uneventful, same snapshot as before}
 
 # release artifact, new artifact
 # {visit full, status full, new snapshot with shared history as prior snapshot}
 
 # release artifact, old artifact with different checksums
 # {visit full, status full, new snapshot with shared history and some new
 # different history}
 
 
 def decode_target(target):
     if not target:
         return target
     target_type = target['target_type']
 
     if target_type == 'alias':
         decoded_target = target['target'].decode('utf-8')
     else:
         decoded_target = hash_to_hex(target['target'])
 
     return {
         'target': decoded_target,
         'target_type': target_type
     }
 
 
 def check_snapshot(expected_snapshot, expected_branches, storage):
     """Check for snapshot match. Provide the hashes as hexadecimal, the
        conversion is done within the method.
 
     Args:
         expected_snapshot (Union[str, dict]): Either the snapshot
                                               identifier or the full
                                               snapshot
         expected_branches ([dict]): expected branches or nothing if the full
                                     snapshot is provided
 
     """
     if isinstance(expected_snapshot, dict) and not expected_branches:
         expected_snapshot_id = expected_snapshot['id']
         expected_branches = expected_snapshot['branches']
     else:
         expected_snapshot_id = expected_snapshot
 
     snap = storage.snapshot_get(hash_to_bytes(expected_snapshot_id))
     assert snap is not None
 
     branches = {
         branch.decode('utf-8'): decode_target(target)
         for branch, target in snap['branches'].items()
     }
     assert expected_branches == branches
diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
new file mode 100644
index 0000000..05e0596
--- /dev/null
+++ b/swh/loader/package/tests/test_utils.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+import os
+import pytest
+
+from swh.loader.package.utils import download
+
+
+@pytest.mark.fs
+def test_download_fail_to_download(tmp_path, requests_mock):
+    url = 'https://pypi.org/pypi/arrow/json'
+    status_code = 404
+    requests_mock.get(url, status_code=status_code)
+
+    with pytest.raises(ValueError) as e:
+        download(url, tmp_path)
+
+    assert e.value.args[0] == "Fail to query '%s'. Reason: %s" % (
+        url, status_code)
+
+
+@pytest.mark.fs
+def test_download_fail_length_mismatch(tmp_path, requests_mock):
+    """A length mismatch after download should raise
+
+    """
+    filename = 'requests-0.0.1.tar.gz'
+    url = 'https://pypi.org/pypi/requests/%s' % filename
+    data = 'this is something'
+    wrong_size = len(data) - 3
+    requests_mock.get(url, text=data, headers={
+        'content-length': str(wrong_size)  # wrong size!
+    })
+
+    with pytest.raises(ValueError) as e:
+        download(url, dest=str(tmp_path))
+
+    assert e.value.args[0] == "Error when checking size: %s != %s" % (
+        wrong_size, len(data)
+    )
+
+
+@pytest.mark.fs
+def test_download_ok(tmp_path, requests_mock):
+    """Download without issue should provide filename and hashes"""
+    filename = 'requests-0.0.1.tar.gz'
+    url = 'https://pypi.org/pypi/requests/%s' % filename
+    data = 'this is something'
+    requests_mock.get(url, text=data, headers={
+        'content-length': str(len(data))
+    })
+
+    actual_filepath, actual_hashes = download(url, dest=str(tmp_path))
+
+    actual_filename = os.path.basename(actual_filepath)
+    assert actual_filename == filename
+    assert actual_hashes['length'] == len(data)
+    assert actual_hashes['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2'
+    assert (actual_hashes['sha256'] ==
+            '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5')
+
+
+@pytest.mark.fs
+def test_download_fail_hashes_mismatch(tmp_path, requests_mock):
+    """A hash mismatch after download should raise
+
+    """
+    pass
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
new file mode 100644
index 0000000..d8d4e5b
--- /dev/null
+++ b/swh/loader/package/utils.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import requests
+
+from typing import Dict, Tuple
+
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.loader.package import DEFAULT_PARAMS
+
+
+def download(url: str, dest: str) -> Tuple[str, Dict]:
+    """Download a remote file from url and compute swh hashes
+       on it.
+
+    Args:
+        url: Artifact uri to fetch and hash
+        dest: Directory to write the archive to
+
+    Raises:
+        ValueError in case of any error when fetching/computing
+
+    Returns:
+        Tuple of local (filepath, hashes of filepath)
+
+    """
+    response = requests.get(url, **DEFAULT_PARAMS, stream=True)
+    if response.status_code != 200:
+        raise ValueError("Fail to query '%s'. Reason: %s" % (
+            url, response.status_code))
+    length = int(response.headers['content-length'])
+
+    filepath = os.path.join(dest, os.path.basename(url))
+
+    h = MultiHash(length=length)
+    with open(filepath, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+            h.update(chunk)
+            f.write(chunk)
+
+    actual_length = os.path.getsize(filepath)
+    if length != actual_length:
+        raise ValueError('Error when checking size: %s != %s' % (
+            length, actual_length))
+
+    # hashes = h.hexdigest()
+    # actual_digest = hashes['sha256']
+    # if actual_digest != artifact['sha256']:
+    #     raise ValueError(
+    #         '%s %s: Checksum mismatched: %s != %s' % (
+    #             project, version, artifact['sha256'], actual_digest))
+
+    return filepath, {
+        'length': length,
+        **h.hexdigest()
+    }
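
A minimal usage sketch of the relocated download helper, outside the patch itself. The artifact URL and destination below are illustrative, not taken from the diff:

    import tempfile

    from swh.loader.package.utils import download

    with tempfile.TemporaryDirectory() as tmpdir:
        # Streams the response to disk in HASH_BLOCK_SIZE chunks while feeding
        # MultiHash, then checks the on-disk size against content-length.
        filepath, hashes = download(
            'https://files.pythonhosted.org/packages/example/0805nexter-1.1.0.zip',  # hypothetical URL
            dest=tmpdir)
        print(filepath, hashes['length'], hashes['sha256'])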