class PyPIClient:
    """PyPI api client.

    This deals with fetching json metadata about pypi projects.

    Args:
        url (str): PyPI instance's url (e.g: https://pypi.org/project/requests)
        api:
            - https://pypi.org/pypi/requests/json
            - https://pypi.org/pypi/requests/1.0.0/json (release description)

    """
    def __init__(self, url):
        self.version = __version__

        _url = urlparse(url)
        # Project urls are frequently written with a trailing slash
        # (https://pypi.org/project/requests/). Strip it first, otherwise the
        # last path segment is '' and the api url loses the project name.
        project_name = _url.path.rstrip('/').split('/')[-1]
        self.url = '%s://%s/pypi/%s' % (_url.scheme, _url.netloc, project_name)
        self._session = None

    @property
    def session(self):
        """Requests session, created on first access and then reused."""
        if self._session is None:
            self._session = requests.session()
        return self._session

    def _get(self, url: str) -> Dict:
        """Get query to the url.

        Args:
            url (str): Url

        Raises:
            ValueError in case of failing to query

        Returns:
            Response as dict if ok

        """
        # Go through the shared session so that successive queries reuse it
        # instead of opening a fresh one per call.
        response = self.session.get(url, **DEFAULT_PARAMS)
        if response.status_code != 200:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))

        return response.json()

    def info_project(self) -> Dict:
        """Retrieve the raw json response describing the project.

        Returns:
            Main project information as dict.

        """
        return self._get('%s/json' % self.url)

    def info_release(self, release: str) -> Dict:
        """Given a release version, retrieve the raw information for such
           release.

        Args:
            release: Release version

        Returns:
            Release information as dict

        """
        return self._get('%s/%s/json' % (self.url, release))
def author(data: Dict) -> Dict:
    """Given a dict of project/release artifact information (coming from
       PyPI), returns an author subset.

    Args:
        data (dict): Representing either artifact information or
                     release information. Only the 'author' and
                     'author_email' keys are read; both may be absent.

    Returns:
        swh-model dict representing a person, with bytes values for
        'fullname', 'name' and 'email' ('name' and 'email' stay None when
        they are missing from data).

    """
    name = data.get('author')
    email = data.get('author_email')

    if email:
        # A missing name must not end up in the fullname as the literal
        # string 'None'; format it the same way as an empty name.
        fullname = '%s <%s>' % (name or '', email)
    else:
        fullname = name

    if not fullname:
        # Neither a usable name nor an email: empty person.
        return {'fullname': b'', 'name': None, 'email': None}

    fullname = fullname.encode('utf-8')

    if name is not None:
        name = name.encode('utf-8')

    if email is not None:
        email = email.encode('utf-8')

    return {'fullname': fullname, 'name': name, 'email': email}
""" visit_type = 'pypi' def __init__(self, url): super().__init__(url=url) self.client = PyPIClient(url) self._info = None @property def info(self) -> Dict: """Return the project metadata information (fetched from pypi registry) """ if not self._info: self._info = self.client.info_project() # dict return self._info def get_versions(self) -> Sequence[str]: return self.info['releases'].keys() def get_artifacts(self, version: str) -> Generator[ Tuple[str, str, Dict], None, None]: for meta in self.info['releases'][version]: yield meta['filename'], meta['url'], meta def fetch_artifact_archive( self, artifact_uri: str, dest: str) -> Tuple[str, Dict]: return download(artifact_uri, dest=dest) def build_revision(self, artifact_uncompressed_path: str) -> Dict: # Parse metadata (project, artifact metadata) metadata = sdist_parse(artifact_uncompressed_path) # Build revision name = metadata['version'].encode('utf-8') message = metadata['message'].encode('utf-8') message = b'%s: %s' % (name, message) if message else name _author = author(metadata) _date = normalize_timestamp( int(iso8601.parse_date(metadata['date']).timestamp())) return { 'name': name, 'message': message, 'author': _author, 'date': _date, 'committer': _author, 'committer_date': _date, 'parents': [], 'metadata': { 'intrinsic_metadata': metadata, } } diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py index a152242..dea3297 100644 --- a/swh/loader/package/tests/test_pypi.py +++ b/swh/loader/package/tests/test_pypi.py @@ -1,210 +1,209 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os -import requests import pytest from swh.loader.package.pypi import PyPILoader, PyPIClient, author def test_author_basic(): data = { 'author': "i-am-groot", 'author_email': 'iam@groot.org', } 
def test_author_malformed_3():
    """Author and email lists of different lengths are kept verbatim"""
    data = {
        'author': '[marie, jeanne, pierre]',
        'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
    }

    actual_author = author(data)

    expected_author = {
        'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>',  # noqa
        'name': b'[marie, jeanne, pierre]',
        'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
    }

    # The comparison has to be asserted: as a bare expression its result is
    # discarded and the test passes whatever author() returns.
    assert actual_author == expected_author
def test_pypiclient_init():
    """The client should derive the api's base project url from the
       project url it is given

    """
    client = PyPIClient(url='https://pypi.org/project/requests')

    assert client.url == 'https://pypi.org/pypi/requests'
def test_pypiclient(requests_mock):
    """Fetching info/release info should be ok"""
    client = PyPIClient('https://pypi.org/project/requests')

    # Project level information
    project_endpoint = '%s/json' % client.url
    requests_mock.get(project_endpoint, text='{"version": "0.0.1"}')
    assert client.info_project() == {'version': '0.0.1'}

    # Release level information
    release_endpoint = '%s/2.0.0/json' % client.url
    requests_mock.get(release_endpoint, text='{"version": "2.0.0"}')
    assert client.info_release("2.0.0") == {'version': '2.0.0'}