diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py index 91789e6..d592e0d 100644 --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -1,302 +1,370 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging import tempfile import os -from typing import Generator, Dict, Tuple, Sequence, List +from typing import Generator, Dict, Tuple, Sequence, List, Optional from swh.core.tarball import uncompress from swh.core.config import SWHConfig from swh.model.from_disk import Directory from swh.model.identifiers import ( revision_identifier, snapshot_identifier, identifier_to_bytes ) from swh.storage import get_storage +from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.loader.core.converters import content_for_storage from swh.loader.package.utils import download logger = logging.getLogger(__name__) # Not implemented yet: # - clean up disk routines from previous killed workers (when OOMkilled) # -> separation of concern would like this to be abstracted from the code # -> experience tells us it's complicated to do as such (T903, T964, T982, # etc...) # # - model: swh.model.merkle.from_disk should output swh.model.model.* objects # to avoid this layer's conversion routine call # -> Take this up within swh.model's current implementation # # - Incremental loading through latest snapshot introspection class PackageLoader: # Origin visit type (str) set by the loader visit_type = '' def __init__(self, url): """Loader's constructor. This raises exception if the minimal required configuration is missing (cf. fn:`check` method). Args: url (str): Origin url to load data from """ # This expects to use the environment variable SWH_CONFIG_FILENAME self.config = SWHConfig.parse_config_file() self._check_configuration() self.storage = get_storage(**self.config['storage']) self.url = url def _check_configuration(self): """Checks the minimal configuration required is set for the loader. If some required configuration is missing, exception detailing the issue is raised. """ if 'storage' not in self.config: raise ValueError( 'Misconfiguration, at least the storage key should be set') def get_versions(self) -> Sequence[str]: """Return the list of all published package versions. Returns: Sequence of published versions """ return [] def get_artifacts(self, version: str) -> Generator[ Tuple[str, str, Dict], None, None]: """Given a release version of a package, retrieve the associated artifact information for such version. Args: version: Package version Returns: (artifact filename, artifact uri, raw artifact metadata) """ yield from {} def fetch_artifact_archive( self, artifact_uri: str, dest: str) -> Tuple[str, Dict]: """Fetch artifact archive to a temporary folder and returns its path. 
        Args:
            artifact_uri: Artifact uri to fetch
            dest: Directory to write the downloaded archive to

        Returns:
            the locally retrieved artifact path

        """
        return download(artifact_uri, dest=dest)

    def build_revision(
            self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
        """Build the revision dict

        Returns:
            SWH data dict

        """
        return {}

    def get_default_release(self) -> str:
        """Retrieve the latest release version

        Returns:
            Latest version

        """
        return ''

+    def last_snapshot(self) -> Optional[Dict]:
+        """Retrieve the last snapshot taken for this origin, if any.
+
+        """
+        visit = self.storage.origin_visit_get_latest(
+            self.url, require_snapshot=True)
+        if visit:
+            return snapshot_get_all_branches(
+                self.storage, visit['snapshot']['id'])
+
+    def known_artifacts(self, snapshot: Dict) -> Dict[bytes, Dict]:
+        """Retrieve the known releases/artifacts for the origin.
+
+        Args:
+            snapshot: snapshot for the visit
+
+        Returns:
+            Dict mapping each known revision id (bytes) to its
+            'original_artifact' metadata dict.
+
+        """
+        if not snapshot or 'branches' not in snapshot:
+            return {}
+
+        # retrieve only the revision targets (we do not want the alias
+        # branches here)
+        revs = [rev['target']
+                for rev in snapshot['branches'].values()
+                if rev and rev['target_type'] == 'revision']
+        known_revisions = self.storage.revision_get(revs)
+
+        ret = {}
+        for revision in known_revisions:
+            if not revision:  # revision_get can return None
+                continue
+            original_artifact = revision['metadata'].get('original_artifact')
+            if original_artifact:
+                ret[revision['id']] = original_artifact
+
+        return ret
+
+    def resolve_revision_from(
+            self, known_artifacts: Dict, artifact_metadata: Dict) \
+            -> Optional[bytes]:
+        """Resolve the revision from the known artifacts and an artifact
+        metadata dict.
+
+        If the artifact has already been downloaded, this returns the
+        existing revision targeting that uncompressed artifact directory.
+        Otherwise, this returns None.
+
+        Args:
+            known_artifacts: Known artifacts (as returned by
+                known_artifacts())
+            artifact_metadata: Artifact metadata dict for one artifact
+
+        Returns:
+            None or the existing revision identifier (bytes)
+
+        """
+        return None
+
    def load(self) -> Dict:
        """Load for a specific origin the associated contents.

        for each package version of the origin

        1. Fetch the files for one package version

           By default, this can be implemented as a simple HTTP request.
           Loaders with more specific requirements can override this, e.g.:
           the PyPI loader checks the integrity of the downloaded files; the
           Debian loader has to download and check several files for one
           package version.

        2. Extract the downloaded files

           By default, this would be a universal archive/tarball extraction.

           Loaders for specific formats can override this method (for
           instance, the Debian loader uses dpkg-source -x).

        3. Convert the extracted directory to a set of Software Heritage
           objects

           Using swh.model.from_disk.

        4. Extract the metadata from the unpacked directories

           This would only be applicable for "smart" loaders like npm
           (parsing the package.json), PyPI (parsing the PKG-INFO file) or
           Debian (parsing debian/changelog and debian/control).

           On "minimal-metadata" sources such as the GNU archive, the lister
           should provide the minimal set of metadata needed to populate the
           revision/release objects (authors, dates) as an argument to the
           task.

        5. Generate the revision/release objects for the given version.

           From the data generated at steps 3 and 4.

        end for each

        6.
Generate and load the snapshot for the visit Using the revisions/releases collected at step 5., and the branch information from step 0., generate a snapshot and load it into the Software Heritage archive """ status_load = 'uneventful' # either: eventful, uneventful, failed status_visit = 'full' # either: partial, full tmp_revisions: Dict[str, List] = {} snapshot = None try: # Prepare origin and origin_visit origin = {'url': self.url} self.storage.origin_add([origin]) visit_date = datetime.datetime.now(tz=datetime.timezone.utc) visit_id = self.storage.origin_visit_add( origin=self.url, date=visit_date, type=self.visit_type)['visit'] + last_snapshot = self.last_snapshot() + logger.debug('last snapshot: %s', last_snapshot) + known_artifacts = self.known_artifacts(last_snapshot) + logger.debug('known artifacts: %s', known_artifacts) # Retrieve the default release (the "latest" one) default_release = self.get_default_release() logger.debug('default release: %s', default_release) for version in self.get_versions(): # for each logger.debug('version: %s', version) tmp_revisions[version] = [] # `a_` stands for `artifact_` for a_filename, a_uri, a_metadata in self.get_artifacts( version): - with tempfile.TemporaryDirectory() as tmpdir: - try: - # a_c_: archive_computed_ - a_path, a_c_metadata = self.fetch_artifact_archive( - a_uri, dest=tmpdir) - except Exception as e: - logger.warning('Unable to retrieve %s. Reason: %s', - a_uri, e) - status_visit = 'partial' - continue - - logger.debug('archive_path: %s', a_path) - logger.debug('archive_computed_metadata: %s', - a_c_metadata) - - uncompressed_path = os.path.join(tmpdir, 'src') - uncompress(a_path, dest=uncompressed_path) - - logger.debug('uncompressed_path: %s', - uncompressed_path) - - directory = Directory.from_disk( - path=uncompressed_path.encode('utf-8'), data=True) - # FIXME: Try not to load the full raw content in memory - objects = directory.collect() - - contents = objects['content'].values() - logger.debug('Number of contents: %s', - len(contents)) - - self.storage.content_add( - map(content_for_storage, contents)) - - status_load = 'eventful' - directories = objects['directory'].values() - - logger.debug('Number of directories: %s', - len(directories)) - - self.storage.directory_add(directories) - - # FIXME: This should be release. cf. D409 discussion - revision = self.build_revision( - a_metadata, uncompressed_path) - revision.update({ - 'type': 'tar', - 'synthetic': True, - 'directory': directory.hash, - }) + revision_id = self.resolve_revision_from( + known_artifacts, a_metadata) + if revision_id is None: + with tempfile.TemporaryDirectory() as tmpdir: + try: + # a_c_: archive_computed_ + a_path, a_c_metadata = self.fetch_artifact_archive( # noqa + a_uri, dest=tmpdir) + except Exception as e: + logger.warning( + 'Unable to retrieve %s. 
Reason: %s', + a_uri, e) + status_visit = 'partial' + continue + + logger.debug('archive_path: %s', a_path) + logger.debug('archive_computed_metadata: %s', + a_c_metadata) + + uncompressed_path = os.path.join(tmpdir, 'src') + uncompress(a_path, dest=uncompressed_path) + + logger.debug('uncompressed_path: %s', + uncompressed_path) + + directory = Directory.from_disk( + path=uncompressed_path.encode('utf-8'), data=True) # noqa + # FIXME: Try not to load the full raw content in + # memory + objects = directory.collect() + + contents = objects['content'].values() + logger.debug('Number of contents: %s', + len(contents)) + + self.storage.content_add( + map(content_for_storage, contents)) + + status_load = 'eventful' + directories = objects['directory'].values() + + logger.debug('Number of directories: %s', + len(directories)) + + self.storage.directory_add(directories) + + # FIXME: This should be release. cf. D409 + revision = self.build_revision( + a_metadata, uncompressed_path) + revision.update({ + 'type': 'tar', + 'synthetic': True, + 'directory': directory.hash, + }) # FIXME: Standardize those metadata keys and use the # correct ones revision['metadata'].update({ 'original_artifact': a_metadata, 'hashes_artifact': a_c_metadata }) - revision['id'] = identifier_to_bytes( + revision['id'] = revision_id = identifier_to_bytes( revision_identifier(revision)) logger.debug('Revision: %s', revision) self.storage.revision_add([revision]) - tmp_revisions[version].append({ - 'filename': a_filename, - 'target': revision['id'], - }) + tmp_revisions[version].append({ + 'filename': a_filename, + 'target': revision_id, + }) # Build and load the snapshot branches = {} for version, v_branches in tmp_revisions.items(): if len(v_branches) == 1: branch_name = ('releases/%s' % version).encode('utf-8') if version == default_release: branches[b'HEAD'] = { 'target_type': 'alias', 'target': branch_name, } branches[branch_name] = { 'target_type': 'revision', 'target': v_branches[0]['target'], } else: for x in v_branches: branch_name = ('releases/%s/%s' % ( version, v_branches['filename'])).encode('utf-8') branches[branch_name] = { 'target_type': 'revision', 'target': x['target'], } snapshot = { 'branches': branches } snapshot['id'] = identifier_to_bytes( snapshot_identifier(snapshot)) logger.debug('snapshot: %s', snapshot) self.storage.snapshot_add([snapshot]) if hasattr(self.storage, 'flush'): self.storage.flush() except Exception as e: logger.warning('Fail to load %s. 
Reason: %s' % (self.url, e)) status_visit = 'partial' finally: self.storage.origin_visit_update( origin=self.url, visit_id=visit_id, status=status_visit, snapshot=snapshot) return {'status': status_load} diff --git a/swh/loader/package/pypi.py b/swh/loader/package/pypi.py index 627b172..da92a95 100644 --- a/swh/loader/package/pypi.py +++ b/swh/loader/package/pypi.py @@ -1,161 +1,169 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os -from typing import Generator, Dict, Tuple, Sequence +from typing import Generator, Dict, Tuple, Sequence, Optional from urllib.parse import urlparse from pkginfo import UnpackedSDist import iso8601 from swh.model.identifiers import normalize_timestamp from swh.loader.package.loader import PackageLoader from swh.loader.package.utils import api_info def pypi_api_url(url: str) -> str: """Compute api url from a project url Args: url (str): PyPI instance's url (e.g: https://pypi.org/project/requests) This deals with correctly transforming the project's api url (e.g https://pypi.org/pypi/requests/json) Returns: api url """ p_url = urlparse(url) project_name = p_url.path.split('/')[-1] url = '%s://%s/pypi/%s/json' % (p_url.scheme, p_url.netloc, project_name) return url def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from pypi. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) != 1: return {} project_dirname = lst[0] pkginfo_path = os.path.join(dir_path, project_dirname, 'PKG-INFO') if not os.path.exists(pkginfo_path): return {} pkginfo = UnpackedSDist(pkginfo_path) raw = pkginfo.__dict__ raw.pop('filename') # this gets added with the ondisk location return raw def author(data: Dict) -> Dict: """Given a dict of project/release artifact information (coming from PyPI), returns an author subset. Args: data (dict): Representing either artifact information or release information. Returns: swh-model dict representing a person. """ name = data.get('author') email = data.get('author_email') if email: fullname = '%s <%s>' % (name, email) else: fullname = name if not fullname: return {'fullname': b'', 'name': None, 'email': None} fullname = fullname.encode('utf-8') if name is not None: name = name.encode('utf-8') if email is not None: email = email.encode('utf-8') return {'fullname': fullname, 'name': name, 'email': email} class PyPILoader(PackageLoader): """Load pypi origin's artifact releases into swh archive. 
""" visit_type = 'pypi' def __init__(self, url): super().__init__(url=url) self._info = None @property def info(self) -> Dict: """Return the project metadata information (fetched from pypi registry) """ if not self._info: self._info = api_info(pypi_api_url(self.url)) return self._info def get_versions(self) -> Sequence[str]: return self.info['releases'].keys() def get_default_release(self) -> str: return self.info['info']['version'] def get_artifacts(self, version: str) -> Generator[ Tuple[str, str, Dict], None, None]: for meta in self.info['releases'][version]: yield meta['filename'], meta['url'], meta + def resolve_revision_from( + self, known_artifacts: Dict, artifact_metadata: Dict) \ + -> Optional[bytes]: + sha256 = artifact_metadata['digests']['sha256'] + for rev_id, known_artifact in known_artifacts.items(): + if sha256 == known_artifact['digests']['sha256']: + return rev_id + def build_revision( self, a_metadata: Dict, a_uncompressed_path: str) -> Dict: # Parse metadata (project, artifact metadata) metadata = extract_intrinsic_metadata(a_uncompressed_path) # from intrinsic metadata name = metadata['version'] _author = author(metadata) # from extrinsic metadata message = a_metadata.get('comment_text', '') message = '%s: %s' % (name, message) if message else name date = normalize_timestamp( int(iso8601.parse_date(a_metadata['upload_time']).timestamp())) return { 'message': message.encode('utf-8'), 'author': _author, 'date': date, 'committer': _author, 'committer_date': date, 'parents': [], 'metadata': { 'intrinsic_metadata': metadata, } } diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py index ec9e9a5..8088ce1 100644 --- a/swh/loader/package/tests/test_pypi.py +++ b/swh/loader/package/tests/test_pypi.py @@ -1,533 +1,541 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from os import path import pytest from unittest.mock import patch from swh.core.tarball import uncompress from swh.model.hashutil import hash_to_bytes from swh.loader.package.pypi import ( PyPILoader, pypi_api_url, author, extract_intrinsic_metadata ) from swh.loader.package.tests.common import ( check_snapshot, DATADIR ) from swh.loader.package.tests.conftest import local_get_factory def test_author_basic(): data = { 'author': "i-am-groot", 'author_email': 'iam@groot.org', } actual_author = author(data) expected_author = { 'fullname': b'i-am-groot ', 'name': b'i-am-groot', 'email': b'iam@groot.org', } assert actual_author == expected_author def test_author_empty_email(): data = { 'author': 'i-am-groot', 'author_email': '', } actual_author = author(data) expected_author = { 'fullname': b'i-am-groot', 'name': b'i-am-groot', 'email': b'', } assert actual_author == expected_author def test_author_empty_name(): data = { 'author': "", 'author_email': 'iam@groot.org', } actual_author = author(data) expected_author = { 'fullname': b' ', 'name': b'', 'email': b'iam@groot.org', } assert actual_author == expected_author def test_author_malformed(): data = { 'author': "['pierre', 'paul', 'jacques']", 'author_email': None, } actual_author = author(data) expected_author = { 'fullname': b"['pierre', 'paul', 'jacques']", 'name': b"['pierre', 'paul', 'jacques']", 'email': None, } assert actual_author == expected_author def test_author_malformed_2(): data = { 'author': '[marie, jeanne]', 
'author_email': '[marie@some, jeanne@thing]', } actual_author = author(data) expected_author = { 'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>', 'name': b'[marie, jeanne]', 'email': b'[marie@some, jeanne@thing]', } assert actual_author == expected_author def test_author_malformed_3(): data = { 'author': '[marie, jeanne, pierre]', 'author_email': '[marie@somewhere.org, jeanne@somewhere.org]', } actual_author = author(data) expected_author = { 'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa 'name': b'[marie, jeanne, pierre]', 'email': b'[marie@somewhere.org, jeanne@somewhere.org]', } actual_author == expected_author # configuration error # def test_badly_configured_loader_raise(monkeypatch): """Badly configured loader should raise""" monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False) with pytest.raises(ValueError) as e: PyPILoader(url='some-url') assert 'Misconfiguration' in e.value.args[0] def test_pypi_api_url(): """Compute pypi api url from the pypi project url should be ok""" url = pypi_api_url('https://pypi.org/project/requests') assert url == 'https://pypi.org/pypi/requests/json' @pytest.mark.fs def test_extract_intrinsic_metadata(tmp_path): """Parsing existing archive's PKG-INFO should yield results""" uncompressed_archive_path = str(tmp_path) archive_path = path.join( DATADIR, 'files.pythonhosted.org', '0805nexter-1.1.0.zip') uncompress(archive_path, dest=uncompressed_archive_path) actual_sdist = extract_intrinsic_metadata(uncompressed_archive_path) expected_sdist = { 'metadata_version': '1.0', 'name': '0805nexter', 'version': '1.1.0', 'summary': 'a simple printer of nested lest', 'home_page': 'http://www.hp.com', 'author': 'hgtkpython', 'author_email': '2868989685@qq.com', 'platforms': ['UNKNOWN'], } assert actual_sdist == expected_sdist @pytest.mark.fs def test_extract_intrinsic_metadata_failures(tmp_path): """Parsing inexistant path/archive/PKG-INFO yield None""" # inexistant first level path assert extract_intrinsic_metadata('/something-inexistant') == {} # inexistant second level path (as expected by pypi archives) assert extract_intrinsic_metadata(tmp_path) == {} # inexistant PKG-INFO within second level path existing_path_no_pkginfo = str(tmp_path / 'something') os.mkdir(existing_path_no_pkginfo) assert extract_intrinsic_metadata(tmp_path) == {} # LOADER SCENARIO # # "edge" cases (for the same origin) # # no release artifact: # {visit full, status: uneventful, no contents, etc...} local_get_missing_all = local_get_factory(ignore_urls=[ 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa 'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip', # noqa ]) def test_no_release_artifact(swh_config, local_get_missing_all): """Load a pypi project with all artifacts missing ends up with no snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status == {'status': 'uneventful'} stats = loader.storage.stat_counters() assert { 'content': 0, 'directory': 0, 'origin': 1, 'origin_visit': 1, 'person': 0, 'release': 0, 'revision': 0, 'skipped_content': 0, 'snapshot': 1, } == stats origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'partial' # problem during loading: # {visit: partial, status: uneventful, no snapshot} def 
test_release_with_traceback(swh_config): url = 'https://pypi.org/project/0805nexter' with patch('swh.loader.package.pypi.PyPILoader.get_default_release', side_effect=ValueError('Problem')): loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status == {'status': 'uneventful'} stats = loader.storage.stat_counters() assert { 'content': 0, 'directory': 0, 'origin': 1, 'origin_visit': 1, 'person': 0, 'release': 0, 'revision': 0, 'skipped_content': 0, 'snapshot': 0, } == stats origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'partial' # problem during loading: failure early enough in between swh contents... # some contents (contents, directories, etc...) have been written in storage # {visit: partial, status: eventful, no snapshot} # problem during loading: failure late enough we can have snapshots (some # revisions are written in storage already) # {visit: partial, status: eventful, snapshot} # "normal" cases (for the same origin) # local_get_missing_one = local_get_factory(ignore_urls=[ 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa ]) # some missing release artifacts: # {visit partial, status: eventful, 1 snapshot} def test_release_with_missing_artifact(swh_config, local_get_missing_one): """Load a pypi project with some missing artifacts ends up with 1 snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status == {'status': 'eventful'} stats = loader.storage.stat_counters() assert { 'content': 3, 'directory': 2, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 1, 'skipped_content': 0, 'snapshot': 1 } == stats expected_contents = map(hash_to_bytes, [ '405859113963cb7a797642b45f171d6360425d16', 'e5686aa568fdb1d19d7f1329267082fe40482d31', '83ecf6ec1114fd260ca7a833a2d165e71258c338', ]) assert list(loader.storage.content_missing_per_sha1(expected_contents))\ == [] expected_dirs = map(hash_to_bytes, [ 'b178b66bd22383d5f16f4f5c923d39ca798861b4', 'c3a58f8b57433a4b56caaa5033ae2e0931405338', ]) assert list(loader.storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa } assert list(loader.storage.revision_missing(expected_revs)) == [] expected_branches = { 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.2.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': 'dd0e4201a232b1c104433741dbf45895b8ac9355', 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'partial' def test_release_artifact_no_prior_visit(swh_config, local_get): """With no prior visit, load a pypi project ends up with 1 snapshot """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) actual_load_status = loader.load() assert actual_load_status == {'status': 'eventful'} stats = loader.storage.stat_counters() assert { 'content': 6, 'directory': 4, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1 } == stats expected_contents = map(hash_to_bytes, [ 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', 
'938c33483285fd8ad57f15497f538320df82aeb8', 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', '405859113963cb7a797642b45f171d6360425d16', 'e5686aa568fdb1d19d7f1329267082fe40482d31', '83ecf6ec1114fd260ca7a833a2d165e71258c338', ]) assert list(loader.storage.content_missing_per_sha1(expected_contents))\ == [] expected_dirs = map(hash_to_bytes, [ '05219ba38bc542d4345d5638af1ed56c7d43ca7d', 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', 'b178b66bd22383d5f16f4f5c923d39ca798861b4', 'c3a58f8b57433a4b56caaa5033ae2e0931405338', ]) assert list(loader.storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa } assert list(loader.storage.revision_missing(expected_revs)) == [] expected_branches = { 'releases/1.1.0': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.2.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a', 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'full' # release artifact, new artifact # {visit full, status full, new snapshot with shared history as prior snapshot} -def test_release_artifact_with_2_visits(swh_config, local_get_visits): +def test_release_artifact_incremental_visit(swh_config, local_get_visits): """With prior visit, 2nd load will result with a different snapshot with some shared history """ url = 'https://pypi.org/project/0805nexter' loader = PyPILoader(url) visit1_actual_load_status = loader.load() visit1_stats = loader.storage.stat_counters() assert visit1_actual_load_status == {'status': 'eventful'} origin_visit1 = next(loader.storage.origin_visit_get(url)) assert origin_visit1['status'] == 'full' assert { 'content': 6, 'directory': 4, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 2, 'skipped_content': 0, 'snapshot': 1 } == visit1_stats # Reset internal state loader._info = None visit2_actual_load_status = loader.load() visit2_stats = loader.storage.stat_counters() assert visit2_actual_load_status == {'status': 'eventful'} visits = list(loader.storage.origin_visit_get(url)) assert len(visits) == 2 assert visits[1]['status'] == 'full' assert { 'content': 6 + 1, # 1 more content 'directory': 4 + 2, # 2 more directories 'origin': 1, 'origin_visit': 1 + 1, 'person': 1, 'release': 0, 'revision': 2 + 1, # 1 more revision 'skipped_content': 0, 'snapshot': 1 + 1, # 1 more snapshot } == visit2_stats expected_contents = map(hash_to_bytes, [ 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', '938c33483285fd8ad57f15497f538320df82aeb8', 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', '405859113963cb7a797642b45f171d6360425d16', 'e5686aa568fdb1d19d7f1329267082fe40482d31', '83ecf6ec1114fd260ca7a833a2d165e71258c338', '92689fa2b7fb4d4fc6fb195bf73a50c87c030639' ]) assert list(loader.storage.content_missing_per_sha1(expected_contents))\ == [] expected_dirs = map(hash_to_bytes, [ '05219ba38bc542d4345d5638af1ed56c7d43ca7d', 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', 'b178b66bd22383d5f16f4f5c923d39ca798861b4', 'c3a58f8b57433a4b56caaa5033ae2e0931405338', 
'e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a', '52604d46843b898f5a43208045d09fcf8731631b', ]) assert list(loader.storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa hash_to_bytes('51247143b01445c9348afa9edfae31bf7c5d86b1'): hash_to_bytes('e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a'), # noqa } assert list(loader.storage.revision_missing(expected_revs)) == [] expected_branches = { 'releases/1.1.0': { 'target': '4c99891f93b81450385777235a37b5e966dd1571', 'target_type': 'revision', }, 'releases/1.2.0': { 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21', 'target_type': 'revision', }, 'releases/1.3.0': { 'target': '51247143b01445c9348afa9edfae31bf7c5d86b1', 'target_type': 'revision', }, 'HEAD': { 'target': 'releases/1.3.0', 'target_type': 'alias', }, } expected_snapshot = { 'id': '2e5149a7b0725d18231a37b342e9b7c4e121f283', 'branches': expected_branches, } check_snapshot(expected_snapshot, loader.storage) origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'full' + urls = [ + m.url for m in local_get_visits.request_history + if m.url.startswith('https://files.pythonhosted.org') + ] + # visited each artifact once across 2 visits + assert len(urls) == len(set(urls)) + + # release artifact, no new artifact # {visit full, status uneventful, same snapshot as before} # release artifact, old artifact with different checksums # {visit full, status full, new snapshot with shared history and some new # different history}
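
Reviewer note: a minimal, self-contained sketch of how the incremental path introduced in this diff is meant to fit together, assuming the dict shapes used above (snapshot branches targeting revisions whose metadata carry an 'original_artifact' entry with PyPI 'digests'). The underscore-prefixed helpers and the toy revision ids are illustrative stand-ins, not the loader's API, and no swh.storage access is involved.

# Hedged sketch: pure-Python illustration of known_artifacts +
# resolve_revision_from, mirroring the PyPI sha256 match, with toy data.

def _known_artifacts(snapshot, revisions):
    """Mimic PackageLoader.known_artifacts: map each revision id to the
    'original_artifact' metadata recorded during a previous visit."""
    ret = {}
    if not snapshot or 'branches' not in snapshot:
        return ret
    for branch in snapshot['branches'].values():
        # only revision branches matter, alias branches (e.g. HEAD) are
        # skipped
        if branch and branch['target_type'] == 'revision':
            rev = revisions.get(branch['target'])
            if rev and rev['metadata'].get('original_artifact'):
                ret[branch['target']] = rev['metadata']['original_artifact']
    return ret


def _resolve_revision_from(known_artifacts, artifact_metadata):
    """Mimic PyPILoader.resolve_revision_from: match on the sha256 digest."""
    sha256 = artifact_metadata['digests']['sha256']
    for rev_id, known in known_artifacts.items():
        if known['digests']['sha256'] == sha256:
            return rev_id
    return None


# Toy previous visit: one release already archived as revision b'rev-1.1.0'.
revisions = {
    b'rev-1.1.0': {
        'metadata': {'original_artifact': {'digests': {'sha256': 'abc'}}},
    },
}
snapshot = {
    'branches': {
        b'releases/1.1.0': {'target_type': 'revision',
                            'target': b'rev-1.1.0'},
        b'HEAD': {'target_type': 'alias', 'target': b'releases/1.1.0'},
    },
}

known = _known_artifacts(snapshot, revisions)
# Already-seen artifact: resolved, so load() skips download and uncompress.
assert _resolve_revision_from(known, {'digests': {'sha256': 'abc'}}) \
    == b'rev-1.1.0'
# New artifact (different digest): not resolved, full processing happens.
assert _resolve_revision_from(known, {'digests': {'sha256': 'def'}}) is None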
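
Reviewer note: a second hedged sketch showing how load() is expected to turn the per-version revision list into snapshot branches for the common single-artifact-per-version case, matching the expected_branches used in the tests above. The revision ids are placeholders and _build_branches is an illustrative extraction, not a function of the loader.

# Hedged sketch: branch construction with a HEAD alias to the default
# release, as exercised by test_release_artifact_no_prior_visit.

def _build_branches(tmp_revisions, default_release):
    branches = {}
    for version, v_branches in tmp_revisions.items():
        if len(v_branches) == 1:
            branch_name = ('releases/%s' % version).encode('utf-8')
            if version == default_release:
                branches[b'HEAD'] = {
                    'target_type': 'alias',
                    'target': branch_name,
                }
            branches[branch_name] = {
                'target_type': 'revision',
                'target': v_branches[0]['target'],
            }
    return branches


branches = _build_branches(
    {'1.1.0': [{'filename': 'p-1.1.0.zip', 'target': b'rev-1.1.0'}],
     '1.2.0': [{'filename': 'p-1.2.0.zip', 'target': b'rev-1.2.0'}]},
    default_release='1.2.0')

assert branches[b'HEAD'] == {'target_type': 'alias',
                             'target': b'releases/1.2.0'}
assert branches[b'releases/1.1.0']['target'] == b'rev-1.1.0'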