diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
index 72f80f3..a77f473 100644
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,295 +1,303 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
import logging
import tempfile
import os

from typing import Generator, Dict, Tuple, Sequence, List

from swh.core.tarball import uncompress
from swh.core.config import SWHConfig
from swh.model.from_disk import Directory
from swh.model.identifiers import (
    revision_identifier, snapshot_identifier, identifier_to_bytes
)
from swh.storage import get_storage
from swh.loader.core.converters import content_for_storage


logger = logging.getLogger(__name__)


# Not implemented yet:
# - clean up disk routines from previous killed workers (when OOMkilled)
#   -> separation of concern would like this to be abstracted from the code
#   -> experience tells us it's complicated to do as such (T903, T964, T982,
#      etc...)
#
# - splitting too many objects sent to storage into groups
#   -> could be a specialized collaborator or storage implementation or proxy
#      which deals with this
#
# - model: swh.model.merkle.from_disk should output swh.model.model.* objects
#   to avoid this layer's conversion routine call
#   -> Take this up within swh.model's current implementation
#
# - Does not trap exceptions yet within the PackageLoader.load method


class PackageLoader:
    # Origin visit type (str) set by the loader
    visit_type = ''

    def __init__(self, url):
        """Loader's constructor. This raises an exception if the minimal
           required configuration is missing (cf. the
           :func:`_check_configuration` method).

        Args:
            url (str): Origin url to load data from

        """
        # This expects to use the environment variable SWH_CONFIG_FILENAME
        self.config = SWHConfig.parse_config_file()
        self._check_configuration()
        self.storage = get_storage(**self.config['storage'])
        self.url = url

    def _check_configuration(self):
        """Checks the minimal configuration required is set for the loader.

        If some required configuration is missing, an exception detailing
        the issue is raised.

        """
        if 'storage' not in self.config:
            raise ValueError(
                'Misconfiguration, at least the storage key should be set')

    def get_versions(self) -> Sequence[str]:
        """Return the list of all published package versions.

        Returns:
            Sequence of published versions

        """
        return []

    def get_artifacts(self, version: str) -> Generator[
            Tuple[str, str, Dict], None, None]:
        """Given a release version of a package, retrieve the associated
           artifact information for that version.

        Args:
            version: Package version

        Yields:
            (artifact filename, artifact uri, raw artifact metadata) tuples

        """
        yield from {}

    def fetch_artifact_archive(
            self, artifact_archive_path: str, dest: str) -> Tuple[str, Dict]:
        """Fetch the artifact archive to a temporary folder and return its
           local path along with computed metadata.

        Args:
            artifact_archive_path: Path to the artifact archive to uncompress
            dest: Directory to write the downloaded archive to

        Returns:
            Tuple of the locally retrieved artifact path and the metadata
            computed on it

        """
        return '', {}

    def build_revision(
            self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
        """Build the revision dict

        Returns:
            SWH revision dict

        """
        return {}

    def get_default_release(self) -> str:
        """Retrieve the latest release version

        Returns:
            Latest version

        """
        return ''

    def load(self) -> Dict:
        """Load for a specific origin the associated contents.

        for each package version of the origin
        1. Fetch the files for one package version. By default, this can be
           implemented as a simple HTTP request. Loaders with more specific
           requirements can override this, e.g.: the PyPI loader checks the
           integrity of the downloaded files; the Debian loader has to
           download and check several files for one package version.

        2. Extract the downloaded files. By default, this would be a
           universal archive/tarball extraction. Loaders for specific formats
           can override this method (for instance, the Debian loader uses
           dpkg-source -x).

        3. Convert the extracted directory to a set of Software Heritage
           objects, using swh.model.from_disk.

        4. Extract the metadata from the unpacked directories. This would
           only be applicable for "smart" loaders like npm (parsing the
           package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing
           debian/changelog and debian/control). On "minimal-metadata"
           sources such as the GNU archive, the lister should provide the
           minimal set of metadata needed to populate the revision/release
           objects (authors, dates) as an argument to the task.

        5. Generate the revision/release objects for the given version, from
           the data generated at steps 3 and 4.

        end for each

        6. Generate and load the snapshot for the visit. Using the
           revisions/releases collected at step 5, and the branch information
           from step 0, generate a snapshot and load it into the Software
           Heritage archive.

        """
        status_load = 'uneventful'  # either: eventful, uneventful, failed
-        status_visit = 'partial'  # either: partial, full
+        status_visit = 'full'  # either: partial, full
        tmp_revisions: Dict[str, List] = {}
+        snapshot = None

        try:
            # Prepare origin and origin_visit
            origin = {'url': self.url}
            self.storage.origin_add([origin])
            visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
            visit_id = self.storage.origin_visit_add(
                origin=self.url,
                date=visit_date,
                type=self.visit_type)['visit']

            # Retrieve the default release (the "latest" one)
            default_release = self.get_default_release()
            logger.debug('default release: %s', default_release)

            # FIXME: Add load exceptions handling
            for version in self.get_versions():  # for each
                logger.debug('version: %s', version)
                tmp_revisions[version] = []
                # `a_` stands for `artifact_`
                for a_filename, a_uri, a_metadata in self.get_artifacts(
                        version):
                    with tempfile.TemporaryDirectory() as tmpdir:
-                        # a_c_: archive_computed_
-                        a_path, a_c_metadata = self.fetch_artifact_archive(
-                            a_uri, dest=tmpdir)
+                        try:
+                            # a_c_: archive_computed_
+                            a_path, a_c_metadata = self.fetch_artifact_archive(
+                                a_uri, dest=tmpdir)
+                        except Exception as e:
+                            logger.warning('Unable to retrieve %s. Reason: %s',
+                                           a_uri, e)
+                            status_visit = 'partial'
+                            continue

                        logger.debug('archive_path: %s', a_path)
                        logger.debug('archive_computed_metadata: %s',
                                     a_c_metadata)

                        uncompressed_path = os.path.join(tmpdir, 'src')
                        uncompress(a_path, dest=uncompressed_path)
                        logger.debug('uncompressed_path: %s',
                                     uncompressed_path)

                        directory = Directory.from_disk(
                            path=uncompressed_path.encode('utf-8'),
                            data=True)
                        # FIXME: Try not to load the full raw content in
                        # memory
                        objects = directory.collect()

                        contents = objects['content'].values()
                        logger.debug('Number of contents: %s', len(contents))

                        self.storage.content_add(
                            map(content_for_storage, contents))

                        status_load = 'eventful'
                        directories = objects['directory'].values()
                        logger.debug('Number of directories: %s',
                                     len(directories))

                        self.storage.directory_add(directories)

                        # FIXME: This should be release. cf. D409 discussion
                        revision = self.build_revision(
                            a_metadata, uncompressed_path)
                        revision.update({
                            'type': 'tar',
                            'synthetic': True,
                            'directory': directory.hash,
                        })
                        revision['metadata'].update({
                            'original_artifact': a_metadata,
                            'hashes_artifact': a_c_metadata
                        })

                        revision['id'] = identifier_to_bytes(
                            revision_identifier(revision))

                        logger.debug('Revision: %s', revision)

                        self.storage.revision_add([revision])

                        tmp_revisions[version].append({
                            'filename': a_filename,
                            'target': revision['id'],
                        })

            # Build and load the snapshot
            branches = {}
            for version, v_branches in tmp_revisions.items():
                if len(v_branches) == 1:
                    branch_name = ('releases/%s' % version).encode('utf-8')
                    if version == default_release:
                        branches[b'HEAD'] = {
                            'target_type': 'alias',
                            'target': branch_name,
                        }

                    branches[branch_name] = {
                        'target_type': 'revision',
                        'target': v_branches[0]['target'],
                    }
                else:
                    for x in v_branches:
                        branch_name = ('releases/%s/%s' % (
                            version, x['filename'])).encode('utf-8')
                        branches[branch_name] = {
                            'target_type': 'revision',
                            'target': x['target'],
                        }

-            snapshot = {
-                'branches': branches
-            }
-            snapshot['id'] = identifier_to_bytes(
-                snapshot_identifier(snapshot))
-            self.storage.snapshot_add([snapshot])
+            if branches:
+                snapshot = {
+                    'branches': branches
+                }
+                snapshot['id'] = identifier_to_bytes(
+                    snapshot_identifier(snapshot))

-            # come so far, we actually reached a full visit
-            status_visit = 'full'
+                logger.debug('snapshot: %s', snapshot)
+                self.storage.snapshot_add([snapshot])

            # Update the visit's state
            self.storage.origin_visit_update(
                origin=self.url, visit_id=visit_id, status=status_visit,
                snapshot=snapshot)
        except Exception as e:
            logger.warning('Fail to load %s. Reason: %s' % (self.url, e))
+            status_visit = 'partial'
        finally:
            return {'status': status_load}
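
For reference, the hook API above can be exercised with a minimal subclass. The sketch below is illustrative only: `ExamplePackageLoader` and its `ARTIFACTS` table are made up, `requests` is assumed available for the HTTP fetch, the revision fields only approximate swh.model's revision format, and actually calling `load()` would still require the loader configuration (SWH_CONFIG_FILENAME) to be set. It merely shows which methods a concrete loader (PyPI, Debian, ...) is expected to override:

    import os
    from typing import Dict, Generator, Sequence, Tuple

    import requests  # assumption: used here for the plain HTTP fetch

    from swh.loader.package.loader import PackageLoader

    # Hypothetical fixture data: version -> (filename, uri, raw metadata)
    ARTIFACTS: Dict[str, Tuple[str, str, Dict]] = {
        '1.0': ('pkg-1.0.tar.gz', 'https://example.org/pkg-1.0.tar.gz', {}),
        '1.1': ('pkg-1.1.tar.gz', 'https://example.org/pkg-1.1.tar.gz', {}),
    }

    class ExamplePackageLoader(PackageLoader):
        visit_type = 'example'

        def get_versions(self) -> Sequence[str]:
            return list(ARTIFACTS)

        def get_default_release(self) -> str:
            return '1.1'

        def get_artifacts(self, version: str) -> Generator[
                Tuple[str, str, Dict], None, None]:
            yield ARTIFACTS[version]

        def fetch_artifact_archive(
                self, artifact_uri: str, dest: str) -> Tuple[str, Dict]:
            # Plain HTTP download; a "smart" loader would also verify
            # checksums here before returning.
            response = requests.get(artifact_uri, stream=True)
            response.raise_for_status()
            filepath = os.path.join(dest, os.path.basename(artifact_uri))
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            return filepath, {'length': os.path.getsize(filepath)}

        def build_revision(
                self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
            # load() fills in 'type', 'synthetic', 'directory' and 'id',
            # and updates 'metadata', so that key must exist.
            return {
                'message': b'synthetic release revision',
                'author': {'fullname': b'', 'name': None, 'email': None},
                'committer': {'fullname': b'', 'name': None, 'email': None},
                'date': {'timestamp': {'seconds': 0, 'microseconds': 0},
                         'offset': 0, 'negative_utc': False},
                'committer_date': {'timestamp': {'seconds': 0,
                                                 'microseconds': 0},
                                   'offset': 0, 'negative_utc': False},
                'parents': [],
                'metadata': {},
            }

With that in place, `ExamplePackageLoader('https://example.org/pkg').load()` would drive the fetch/uncompress/snapshot sequence documented in `load()` above.
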
diff --git a/swh/loader/package/tests/conftest.py b/swh/loader/package/tests/conftest.py
index 7c359b4..aff0231 100644
--- a/swh/loader/package/tests/conftest.py
+++ b/swh/loader/package/tests/conftest.py
@@ -1,99 +1,101 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import os
import re

import pytest

from functools import partial
from os import path
from urllib.parse import urlparse

from .common import DATADIR


logger = logging.getLogger(__name__)


@pytest.fixture
def swh_config(monkeypatch):
    conffile = os.path.join(DATADIR, 'loader.yml')
    monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile)
    return conffile


def get_response_cb(request, context, ignore_urls=[]):
    """Mount point callback to fetch on disk the content of a request

    This is meant to be used as the 'body' argument of the
    requests_mock.get() method.

    It will look for files on the local filesystem based on the requested
    URL, using the following rules:

    - files are searched in the DATADIR/<hostname> directory

    - the local file name is the path part of the URL with path hierarchy
      markers (aka '/') replaced by '_'

    Eg. if you use the requests_mock fixture in your test file as:

        requests_mock.get('https://nowhere.com', body=get_response_cb)
        # or even
        requests_mock.get(re.compile('https://'), body=get_response_cb)

    then a requests.get call like:

        requests.get('https://nowhere.com/path/to/resource')

    will look up the content of the response in:

        DATADIR/nowhere.com/path_to_resource

    Args:
        request (requests.Request): Object requests
        context (requests.Context): Object holding response metadata
            information (status_code, headers, etc...)
        ignore_urls (List): urls whose status response should be 404 even if
            the local file exists

    Returns:
        Optional[FileDescriptor] on the on-disk file to read from within the
        test context

    """
    logger.debug('get_response_cb(%s, %s)', request, context)
-    url = urlparse(request.url)
-    if url in ignore_urls:
+    logger.debug('url: %s', request.url)
+    logger.debug('ignore_urls: %s', ignore_urls)
+    if request.url in ignore_urls:
        context.status_code = 404
        return None
+    url = urlparse(request.url)
    dirname = url.hostname  # pypi.org | files.pythonhosted.org
    # url.path: pypi/<project>/json -> local file: pypi_<project>_json
    filename = url.path[1:]
    if filename.endswith('/'):
        filename = filename[:-1]
    filename = filename.replace('/', '_')
    filepath = path.join(DATADIR, dirname, filename)
    if not path.isfile(filepath):
        context.status_code = 404
        return None
    fd = open(filepath, 'rb')
    context.headers['content-length'] = str(path.getsize(filepath))
    return fd


def local_get_factory(ignore_urls=[]):
    @pytest.fixture
    def local_get(requests_mock):
        cb = partial(get_response_cb, ignore_urls=ignore_urls)
        requests_mock.get(re.compile('https://'), body=cb)
        return requests_mock

    return local_get


local_get = local_get_factory([])
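
To make the mapping rule concrete, here is a small, self-contained rendering of the URL-to-fixture-path logic that `get_response_cb` applies. The `url_to_fixture_path` helper name and the stand-in `DATADIR` value are ours, for illustration only:

    from os import path
    from urllib.parse import urlparse

    DATADIR = '/tmp/swh-loader-tests-data'  # stand-in for tests/common.DATADIR

    def url_to_fixture_path(url: str) -> str:
        """Mirror get_response_cb's URL -> local fixture file mapping."""
        parsed = urlparse(url)
        filename = parsed.path[1:]             # drop the leading '/'
        if filename.endswith('/'):
            filename = filename[:-1]           # drop any trailing '/'
        filename = filename.replace('/', '_')  # flatten the path hierarchy
        return path.join(DATADIR, parsed.hostname, filename)

    assert url_to_fixture_path(
        'https://pypi.org/pypi/0805nexter/json'
    ) == path.join(DATADIR, 'pypi.org', 'pypi_0805nexter_json')
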
diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py
index 8dd6bf6..6adbae2 100644
--- a/swh/loader/package/tests/test_pypi.py
+++ b/swh/loader/package/tests/test_pypi.py
@@ -1,304 +1,378 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
from os import path

import pytest

from swh.core.tarball import uncompress
from swh.model.hashutil import hash_to_bytes

from swh.loader.package.pypi import (
    PyPILoader, pypi_api_url, pypi_info, author, sdist_parse
)

from swh.loader.package.tests.common import DATADIR, check_snapshot
+from swh.loader.package.tests.conftest import local_get_factory


def test_author_basic():
    data = {
        'author': "i-am-groot",
        'author_email': 'iam@groot.org',
    }
    actual_author = author(data)

    expected_author = {
        'fullname': b'i-am-groot <iam@groot.org>',
        'name': b'i-am-groot',
        'email': b'iam@groot.org',
    }

    assert actual_author == expected_author


def test_author_empty_email():
    data = {
        'author': 'i-am-groot',
        'author_email': '',
    }
    actual_author = author(data)

    expected_author = {
        'fullname': b'i-am-groot',
        'name': b'i-am-groot',
        'email': b'',
    }

    assert actual_author == expected_author


def test_author_empty_name():
    data = {
        'author': "",
        'author_email': 'iam@groot.org',
    }
    actual_author = author(data)

    expected_author = {
        'fullname': b' <iam@groot.org>',
        'name': b'',
        'email': b'iam@groot.org',
    }

    assert actual_author == expected_author


def test_author_malformed():
    data = {
        'author': "['pierre', 'paul', 'jacques']",
        'author_email': None,
    }

    actual_author = author(data)

    expected_author = {
        'fullname': b"['pierre', 'paul', 'jacques']",
        'name': b"['pierre', 'paul', 'jacques']",
        'email': None,
    }

    assert actual_author == expected_author


def test_author_malformed_2():
    data = {
        'author': '[marie, jeanne]',
        'author_email': '[marie@some, jeanne@thing]',
    }

    actual_author = author(data)

    expected_author = {
        'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
        'name': b'[marie, jeanne]',
        'email': b'[marie@some, jeanne@thing]',
    }

    assert actual_author == expected_author


def test_author_malformed_3():
    data = {
        'author': '[marie, jeanne, pierre]',
        'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
    }

    actual_author = author(data)

    expected_author = {
        'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>',  # noqa
        'name': b'[marie, jeanne, pierre]',
        'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
    }

    assert actual_author == expected_author


# configuration error #

def test_badly_configured_loader_raise(monkeypatch):
    """Badly configured loader should raise"""
    monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False)
    with pytest.raises(ValueError) as e:
        PyPILoader(url='some-url')

    assert 'Misconfiguration' in e.value.args[0]


def test_pypi_api_url():
    """Computing the pypi api url from the pypi project url should be ok"""
    url = pypi_api_url('https://pypi.org/project/requests')
    assert url == 'https://pypi.org/pypi/requests/json'


def test_pypi_info_failure(requests_mock):
    """Failure to fetch info/release information should raise"""
    project_url = 'https://pypi.org/project/requests'
    info_url = 'https://pypi.org/pypi/requests/json'
    status_code = 400
    requests_mock.get(info_url, status_code=status_code)

    with pytest.raises(ValueError) as e0:
        pypi_info(project_url)

    assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (
        info_url, status_code
    )


def test_pypi_info(requests_mock):
    """Fetching json info from a pypi project should be ok"""
    url = 'https://pypi.org/project/requests'
    info_url = 'https://pypi.org/pypi/requests/json'
    requests_mock.get(info_url, text='{"version": "0.0.1"}')
    actual_info = pypi_info(url)
    assert actual_info == {
        'version': '0.0.1',
    }


@pytest.mark.fs
def test_sdist_parse(tmp_path):
    """Parsing an existing archive's PKG-INFO should yield results"""
    uncompressed_archive_path = str(tmp_path)
    archive_path = path.join(
        DATADIR, 'files.pythonhosted.org', '0805nexter-1.1.0.zip')
    uncompress(archive_path, dest=uncompressed_archive_path)

    actual_sdist = sdist_parse(uncompressed_archive_path)
    expected_sdist = {
        'metadata_version': '1.0',
        'name': '0805nexter',
        'version': '1.1.0',
        'summary': 'a simple printer of nested lest',
        'home_page': 'http://www.hp.com',
        'author': 'hgtkpython',
        'author_email': '2868989685@qq.com',
        'platforms': ['UNKNOWN'],
    }

    assert actual_sdist == expected_sdist


@pytest.mark.fs
def test_sdist_parse_failures(tmp_path):
    """Parsing a nonexistent path/archive/PKG-INFO yields an empty dict"""
    # nonexistent first-level path
    assert sdist_parse('/something-inexistant') == {}
    # nonexistent second-level path (as expected by pypi archives)
    assert sdist_parse(tmp_path) == {}
    # nonexistent PKG-INFO within the second-level path
    existing_path_no_pkginfo = str(tmp_path / 'something')
    os.mkdir(existing_path_no_pkginfo)
    assert sdist_parse(tmp_path) == {}
# LOADER SCENARIO #

# "edge" cases (for the same origin) #

+# no release artifact:
+# {visit full, status: uneventful, no contents, etc...}
def test_no_release_artifact(requests_mock):
    pass

-# no release artifact:
-# {visit full, status: uneventful, no contents, etc...}
-
# problem during loading:
# {visit: partial, status: uneventful, no snapshot}
+
+
+
# problem during loading: failure early enough in between swh contents...
# some contents (contents, directories, etc...) have been written in storage
# {visit: partial, status: eventful, no snapshot}

# problem during loading: failure late enough we can have snapshots (some
# revisions are written in storage already)
# {visit: partial, status: eventful, snapshot}

# "normal" cases (for the same origin) #

+
+local_get_missing = local_get_factory(ignore_urls=[
+    'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip',  # noqa
+])
+
+
+# some missing release artifacts:
+# {visit partial, status: eventful, 1 snapshot}
+def test_release_with_missing_artifact(swh_config, local_get_missing):
+    """Loading a pypi project with some missing artifacts ends up with
+       1 snapshot
+
+    """
+    loader = PyPILoader('https://pypi.org/project/0805nexter')
+
+    actual_load_status = loader.load()
+
+    assert actual_load_status == {'status': 'eventful'}
+
+    stats = loader.storage.stat_counters()
+    assert {
+        'content': 3,
+        'directory': 2,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': 1,
+        'skipped_content': 0,
+        'snapshot': 1
+    } == stats
+
+    expected_contents = map(hash_to_bytes, [
+        '405859113963cb7a797642b45f171d6360425d16',
+        'e5686aa568fdb1d19d7f1329267082fe40482d31',
+        '83ecf6ec1114fd260ca7a833a2d165e71258c338',
+    ])
+
+    assert list(loader.storage.content_missing_per_sha1(expected_contents))\
+        == []
+
+    expected_dirs = map(hash_to_bytes, [
+        'b178b66bd22383d5f16f4f5c923d39ca798861b4',
+        'c3a58f8b57433a4b56caaa5033ae2e0931405338',
+    ])
+
+    assert list(loader.storage.directory_missing(expected_dirs)) == []
+
+    # {revision hash: directory hash}
+    expected_revs = {
+        hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'),  # noqa
+    }
+    assert list(loader.storage.revision_missing(expected_revs)) == []
+
+    expected_branches = {
+        'releases/1.2.0': {
+            'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
+            'target_type': 'revision',
+        },
+        'HEAD': {
+            'target': 'releases/1.2.0',
+            'target_type': 'alias',
+        },
+    }
+
+    check_snapshot(
+        'dd0e4201a232b1c104433741dbf45895b8ac9355',
+        expected_branches,
+        storage=loader.storage)
+
+
def test_release_artifact_no_prior_visit(swh_config, local_get):
    """With no prior visit, loading a pypi project ends up with 1 snapshot

    """
    loader = PyPILoader('https://pypi.org/project/0805nexter')

    actual_load_status = loader.load()

    assert actual_load_status == {'status': 'eventful'}

    stats = loader.storage.stat_counters()
    assert {
        'content': 6,
        'directory': 4,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 2,
        'skipped_content': 0,
        'snapshot': 1
    } == stats

    expected_contents = map(hash_to_bytes, [
        'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
        '938c33483285fd8ad57f15497f538320df82aeb8',
        'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
        '405859113963cb7a797642b45f171d6360425d16',
        'e5686aa568fdb1d19d7f1329267082fe40482d31',
        '83ecf6ec1114fd260ca7a833a2d165e71258c338',
    ])

    assert list(loader.storage.content_missing_per_sha1(expected_contents))\
        == []

    expected_dirs = map(hash_to_bytes, [
        '05219ba38bc542d4345d5638af1ed56c7d43ca7d',
        'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
        'b178b66bd22383d5f16f4f5c923d39ca798861b4',
        'c3a58f8b57433a4b56caaa5033ae2e0931405338',
    ])

    assert list(loader.storage.directory_missing(expected_dirs)) == []

    # {revision hash: directory hash}
    expected_revs = {
        hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'),  # noqa
        hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'),  # noqa
    }
    assert list(loader.storage.revision_missing(expected_revs)) == []

    expected_branches = {
        'releases/1.1.0': {
            'target': '4c99891f93b81450385777235a37b5e966dd1571',
            'target_type': 'revision',
        },
        'releases/1.2.0': {
            'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
            'target_type': 'revision',
        },
        'HEAD': {
            'target': 'releases/1.2.0',
            'target_type': 'alias',
        },
    }

    check_snapshot(
        'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a',
        expected_branches,
        storage=loader.storage)


# release artifact, no new artifact
# {visit full, status uneventful, same snapshot as before}

# release artifact, new artifact
# {visit full, status eventful, new snapshot with shared history as prior
#  snapshot}

# release artifact, old artifact with different checksums
# {visit full, status eventful, new snapshot with shared history and some
#  new different history}
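
As a possible starting point for the first "normal" follow-up scenario above, a second visit with no new artifact could be sketched as follows. This is only a sketch: the test name is hypothetical, and the asserted statuses and snapshot count describe the intended behavior from the comment, which the loader does not implement yet (today a second load would replay the same artifacts):

    def test_release_artifact_no_new_artifact(swh_config, local_get):
        """Second visit with no new artifact: uneventful, same snapshot

           (hypothetical test sketching the intended behavior)
        """
        loader = PyPILoader('https://pypi.org/project/0805nexter')

        # First visit ingests everything.
        assert loader.load() == {'status': 'eventful'}
        # Second visit sees no new artifact.
        assert loader.load() == {'status': 'uneventful'}

        stats = loader.storage.stat_counters()
        assert stats['origin_visit'] == 2
        assert stats['snapshot'] == 1  # the prior snapshot is reused
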