diff --git a/swh/loader/package/deposit.py b/swh/loader/package/deposit.py
index 3177ad4..1fe1752 100644
--- a/swh/loader/package/deposit.py
+++ b/swh/loader/package/deposit.py
@@ -1,154 +1,154 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import logging

 from typing import Any, Dict, Generator, Mapping, Sequence, Tuple

-from swh.model.hashutil import hash_to_hex
+from swh.model.hashutil import hash_to_hex, hash_to_bytes
 from swh.loader.package.loader import PackageLoader
 from swh.deposit.client import PrivateApiDepositClient as ApiClient

 logger = logging.getLogger(__name__)


 class DepositLoader(PackageLoader):
     """Load a deposit's artifacts into the swh archive.

     """
     visit_type = 'deposit'

     def __init__(self, url: str, deposit_id: str):
         """Constructor

         Args:
             url: Origin url to associate the artifacts/metadata to
             deposit_id: Deposit identifier

         """
         super().__init__(url=url)

         # For now build back existing api urls
         # archive_url: Private api url to retrieve archive artifact
         self.archive_url = '/%s/raw/' % deposit_id
         # metadata_url: Private api url to retrieve the deposit metadata
         self.metadata_url = '/%s/meta/' % deposit_id
         # deposit_update_url: Private api to push pids and status update on
         # the deposit id
         self.deposit_update_url = '/%s/update/' % deposit_id
         self.client = ApiClient()
         self._metadata = None

     @property
     def metadata(self):
         if self._metadata is None:
             self._metadata = self.client.metadata_get(self.metadata_url)
         return self._metadata

     def get_versions(self) -> Sequence[str]:
         # only 1 branch 'HEAD' with no alias since we only have 1 snapshot
         # branch
         return ['HEAD']

     def get_package_info(self, version: str) -> Generator[
             Tuple[str, Mapping[str, Any]], None, None]:
         p_info = {
             'url': self.client.base_url + self.archive_url,
             'filename': 'archive.zip',
             'raw': self.metadata,
         }
         yield 'HEAD', p_info

     def build_revision(
             self, a_metadata: Dict, uncompressed_path: str) -> Dict:
         revision = a_metadata.pop('revision')
         metadata = {
             'extrinsic': {
                 'provider': '%s/%s' % (
                     self.client.base_url, self.metadata_url),
                 'when': self.visit_date.isoformat(),
                 'raw': a_metadata,
             },
         }

         # FIXME: the deposit no longer needs to build the revision
         revision['metadata'].update(metadata)
         revision['author'] = parse_author(revision['author'])
         revision['committer'] = parse_author(revision['committer'])
         revision['message'] = revision['message'].encode('utf-8')
         revision['type'] = 'tar'
         return revision

     def load(self) -> Dict:
         # Usual loading
         r = super().load()
         success = r['status'] != 'failed'

         if success:
             # Update archive with metadata information
             origin_metadata = self.metadata['origin_metadata']

             logger.debug('origin_metadata: %s', origin_metadata)
             tools = self.storage.tool_add([origin_metadata['tool']])
             logger.debug('tools: %s', tools)
             tool_id = tools[0]['id']

             provider = origin_metadata['provider']
             # FIXME: Shall we delete this info?
             provider_id = self.storage.metadata_provider_add(
                 provider['provider_name'],
                 provider['provider_type'],
                 provider['provider_url'],
                 metadata=None)

             metadata = origin_metadata['metadata']
             self.storage.origin_metadata_add(
                 self.url, self.visit_date, provider_id, tool_id, metadata)

         # Update deposit status
         try:
             if not success:
                 self.client.status_update(
                     self.deposit_update_url, status='failed')
                 return r

-            snapshot_id = r['snapshot_id']
+            snapshot_id = hash_to_bytes(r['snapshot_id'])
             branches = self.storage.snapshot_get(snapshot_id)['branches']
             logger.debug('branches: %s', branches)
             if not branches:
                 return r
             rev_id = branches[b'HEAD']['target']

             revision = next(self.storage.revision_get([rev_id]))

             # Retrieve the revision identifier
             dir_id = revision['directory']

             # update the deposit's status to success with its
             # revision-id and directory-id
             self.client.status_update(
                 self.deposit_update_url,
                 status='done',
                 revision_id=hash_to_hex(rev_id),
                 directory_id=hash_to_hex(dir_id),
                 origin_url=self.url)
         except Exception:
             logger.exception(
                 'Problem when trying to update the deposit\'s status')
             return {'status': 'failed'}
         return r


 def parse_author(author):
     """See prior fixme

     """
     return {
         'fullname': author['fullname'].encode('utf-8'),
         'name': author['name'].encode('utf-8'),
         'email': author['email'].encode('utf-8'),
     }
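
Invoking the deposit loader end to end is short. A minimal usage sketch, assuming SWH_CONFIG_FILENAME points at a configuration with a 'storage' key (the config path below is hypothetical) and that the deposit exists behind the private API:

    import os
    os.environ['SWH_CONFIG_FILENAME'] = '/etc/softwareheritage/loader.yml'  # hypothetical path

    from swh.loader.package.deposit import DepositLoader

    loader = DepositLoader(
        url='https://hal-test.archives-ouvertes.fr/some-external-id',
        deposit_id='666')
    result = loader.load()
    # With this patch, 'snapshot_id' is a hex string, e.g.:
    # {'status': 'eventful',
    #  'snapshot_id': '453f455d0efb69586143cd6b6e5897f9906b53a7'}
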
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
index 4c7d462..f53db3a 100644
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,394 +1,395 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime
 import logging
 import tempfile
 import os

 from typing import (
     Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
 )

 from swh.core.tarball import uncompress
 from swh.core.config import SWHConfig
 from swh.model.from_disk import Directory
+from swh.model.hashutil import hash_to_hex
 from swh.model.identifiers import (
     revision_identifier, snapshot_identifier, identifier_to_bytes
 )
 from swh.storage import get_storage
 from swh.storage.algos.snapshot import snapshot_get_all_branches
 from swh.loader.core.converters import content_for_storage
 from swh.loader.package.utils import download

 logger = logging.getLogger(__name__)


 # Not implemented yet:
 # - clean up disk routines from previous killed workers (when OOMkilled)
 #   -> separation of concern would like this to be abstracted from the code
 #   -> experience tells us it's complicated to do as such (T903, T964, T982,
 #      etc...)
 #
 # - model: swh.model.merkle.from_disk should output swh.model.model.* objects
 #   to avoid this layer's conversion routine call
 #   -> Take this up within swh.model's current implementation


 class PackageLoader:
     # Origin visit type (str) set by the loader
     visit_type = ''

     def __init__(self, url):
         """Loader's constructor. This raises an exception if the minimal
            required configuration is missing (cf. :func:`_check_configuration`).

         Args:
             url (str): Origin url to load data from

         """
         # This expects to use the environment variable SWH_CONFIG_FILENAME
         self.config = SWHConfig.parse_config_file()
         self._check_configuration()
         self.storage = get_storage(**self.config['storage'])
         self.url = url
         self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)

     def _check_configuration(self):
         """Checks the minimal configuration required is set for the loader.

         If some required configuration is missing, an exception detailing
         the issue is raised.

         """
         if 'storage' not in self.config:
             raise ValueError(
                 'Misconfiguration, at least the storage key should be set')

     def get_versions(self) -> Sequence[str]:
         """Return the list of all published package versions.

         Returns:
             Sequence of published versions

         """
         return []

     def get_package_info(self, version: str) -> Generator[
             Tuple[str, Mapping[str, Any]], None, None]:
         """Given a release version of a package, retrieve the associated
            package information for such version.

         Args:
             version: Package version

         Returns:
             (branch name, package metadata)

         """
         yield from {}

     def build_revision(
             self, a_metadata: Dict, uncompressed_path: str) -> Dict:
         """Build the revision dict from the archive metadata (extrinsic
         artifact metadata) and the intrinsic metadata.

         Args:
             a_metadata: Artifact metadata
             uncompressed_path: Artifact uncompressed path on disk

         Returns:
             SWH data dict

         """
         return {}

     def get_default_version(self) -> str:
         """Retrieve the latest release version if any.

         Returns:
             Latest version

         """
         return ''

     def last_snapshot(self) -> Optional[Dict]:
         """Retrieve the last snapshot

         """
         snapshot = None
         visit = self.storage.origin_visit_get_latest(
             self.url, require_snapshot=True)
         if visit:
             snapshot = snapshot_get_all_branches(
                 self.storage, visit['snapshot'])
         return snapshot

     def known_artifacts(self, snapshot: Optional[Dict]) -> Dict:
         """Retrieve the known releases/artifacts for the origin.

         Args:
             snapshot: snapshot for the visit

         Returns:
             Dict mapping revision ids (bytes) to metadata dicts.

         """
         if not snapshot or 'branches' not in snapshot:
             return {}

         # retrieve only revisions (e.g the alias we do not want here)
         revs = [rev['target']
                 for rev in snapshot['branches'].values()
                 if rev and rev['target_type'] == 'revision']
         known_revisions = self.storage.revision_get(revs)

         ret = {}
         for revision in known_revisions:
             if not revision:  # revision_get can return None
                 continue
             ret[revision['id']] = revision['metadata']
         return ret

     def resolve_revision_from(
             self, known_artifacts: Dict, artifact_metadata: Dict) \
             -> Optional[bytes]:
         """Resolve the revision from the known artifacts and an artifact
         metadata dict.

         If the artifact has already been downloaded, this returns the
         existing revision targeting that uncompressed artifact directory.
         Otherwise, this returns None.

         Args:
             known_artifacts: Known artifacts from a previous snapshot
             artifact_metadata: Information dict

         Returns:
             None or revision identifier

         """
         return None

     def download_package(self, p_info: Mapping[str, Any],
                          tmpdir: str) -> List[Tuple[str, Mapping]]:
         """Download artifacts for a specific package. All downloads happen
         in the tmpdir folder.

         The default implementation expects the package info to describe one
         artifact per package. Note that most implementations have one
         artifact per package, but some have several (debian) and some have
         none, the package being the artifact (gnu).

         Args:
             p_info: Information on the package artifacts to download
                 (url, filename, etc...)
             tmpdir: Location to retrieve such artifacts

         Returns:
             List of (path, computed hashes)

         """
         a_uri = p_info['url']
         filename = p_info.get('filename')
         return [download(a_uri, dest=tmpdir, filename=filename)]

     def uncompress(self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]],
                    dest: str) -> str:
         """Uncompress the artifact(s) in the destination folder dest.

         Optionally, this could need to use the p_info dict for some more
         information (debian).
""" uncompressed_path = os.path.join(dest, 'src') for a_path, _ in dl_artifacts: uncompress(a_path, dest=uncompressed_path) return uncompressed_path def load(self) -> Dict: """Load for a specific origin the associated contents. for each package version of the origin 1. Fetch the files for one package version By default, this can be implemented as a simple HTTP request. Loaders with more specific requirements can override this, e.g.: the PyPI loader checks the integrity of the downloaded files; the Debian loader has to download and check several files for one package version. 2. Extract the downloaded files By default, this would be a universal archive/tarball extraction. Loaders for specific formats can override this method (for instance, the Debian loader uses dpkg-source -x). 3. Convert the extracted directory to a set of Software Heritage objects Using swh.model.from_disk. 4. Extract the metadata from the unpacked directories This would only be applicable for "smart" loaders like npm (parsing the package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing debian/changelog and debian/control). On "minimal-metadata" sources such as the GNU archive, the lister should provide the minimal set of metadata needed to populate the revision/release objects (authors, dates) as an argument to the task. 5. Generate the revision/release objects for the given version. From the data generated at steps 3 and 4. end for each 6. Generate and load the snapshot for the visit Using the revisions/releases collected at step 5., and the branch information from step 0., generate a snapshot and load it into the Software Heritage archive """ status_load = 'uneventful' # either: eventful, uneventful, failed status_visit = 'full' # either: partial, full tmp_revisions = {} # type: Dict[str, List] snapshot = None try: # Prepare origin and origin_visit origin = {'url': self.url} self.storage.origin_add_one(origin) visit_id = self.storage.origin_visit_add( origin=self.url, date=self.visit_date, type=self.visit_type)['visit'] last_snapshot = self.last_snapshot() logger.debug('last snapshot: %s', last_snapshot) known_artifacts = self.known_artifacts(last_snapshot) logger.debug('known artifacts: %s', known_artifacts) # Retrieve the default release version (the "latest" one) default_version = self.get_default_version() logger.debug('default version: %s', default_version) for version in self.get_versions(): # for each logger.debug('version: %s', version) tmp_revisions[version] = [] # `p_` stands for `package_` for branch_name, p_info in self.get_package_info(version): logger.debug('package_info: %s', p_info) revision_id = self.resolve_revision_from( known_artifacts, p_info['raw']) if revision_id is None: with tempfile.TemporaryDirectory() as tmpdir: try: dl_artifacts = self.download_package( p_info, tmpdir) except Exception: logger.exception('Unable to retrieve %s', p_info) status_visit = 'partial' continue uncompressed_path = self.uncompress( dl_artifacts, dest=tmpdir) logger.debug('uncompressed_path: %s', uncompressed_path) directory = Directory.from_disk( path=uncompressed_path.encode('utf-8'), data=True) # noqa # FIXME: Try not to load the full raw content in # memory objects = directory.collect() contents = objects['content'].values() logger.debug('Number of contents: %s', len(contents)) self.storage.content_add( [content_for_storage(x) for x in contents]) status_load = 'eventful' directories = list(objects['directory'].values()) logger.debug('Number of directories: %s', len(directories)) 
     def load(self) -> Dict:
         """Load for a specific origin the associated contents.

         for each package version of the origin

         1. Fetch the files for one package version By default, this can be
            implemented as a simple HTTP request. Loaders with more specific
            requirements can override this, e.g.: the PyPI loader checks the
            integrity of the downloaded files; the Debian loader has to
            download and check several files for one package version.

         2. Extract the downloaded files By default, this would be a
            universal archive/tarball extraction.

            Loaders for specific formats can override this method (for
            instance, the Debian loader uses dpkg-source -x).

         3. Convert the extracted directory to a set of Software Heritage
            objects Using swh.model.from_disk.

         4. Extract the metadata from the unpacked directories This would
            only be applicable for "smart" loaders like npm (parsing the
            package.json), PyPI (parsing the PKG-INFO file) or Debian
            (parsing debian/changelog and debian/control).

            On "minimal-metadata" sources such as the GNU archive, the
            lister should provide the minimal set of metadata needed to
            populate the revision/release objects (authors, dates) as an
            argument to the task.

         5. Generate the revision/release objects for the given version.
            From the data generated at steps 3 and 4.

         end for each

         6. Generate and load the snapshot for the visit

         Using the revisions/releases collected at step 5., and the branch
         information from step 0., generate a snapshot and load it into the
         Software Heritage archive

         """
         status_load = 'uneventful'  # either: eventful, uneventful, failed
         status_visit = 'full'       # either: partial, full
         tmp_revisions = {}  # type: Dict[str, List]
         snapshot = None

         try:
             # Prepare origin and origin_visit
             origin = {'url': self.url}
             self.storage.origin_add_one(origin)
             visit_id = self.storage.origin_visit_add(
                 origin=self.url,
                 date=self.visit_date,
                 type=self.visit_type)['visit']
             last_snapshot = self.last_snapshot()
             logger.debug('last snapshot: %s', last_snapshot)
             known_artifacts = self.known_artifacts(last_snapshot)
             logger.debug('known artifacts: %s', known_artifacts)

             # Retrieve the default release version (the "latest" one)
             default_version = self.get_default_version()
             logger.debug('default version: %s', default_version)

             for version in self.get_versions():  # for each
                 logger.debug('version: %s', version)
                 tmp_revisions[version] = []
                 # `p_` stands for `package_`
                 for branch_name, p_info in self.get_package_info(version):
                     logger.debug('package_info: %s', p_info)
                     revision_id = self.resolve_revision_from(
                         known_artifacts, p_info['raw'])
                     if revision_id is None:
                         with tempfile.TemporaryDirectory() as tmpdir:
                             try:
                                 dl_artifacts = self.download_package(
                                     p_info, tmpdir)
                             except Exception:
                                 logger.exception('Unable to retrieve %s',
                                                  p_info)
                                 status_visit = 'partial'
                                 continue

                             uncompressed_path = self.uncompress(
                                 dl_artifacts, dest=tmpdir)
                             logger.debug('uncompressed_path: %s',
                                          uncompressed_path)

                             directory = Directory.from_disk(
                                 path=uncompressed_path.encode('utf-8'),
                                 data=True)  # noqa
                             # FIXME: Try not to load the full raw content in
                             # memory
                             objects = directory.collect()

                             contents = objects['content'].values()
                             logger.debug('Number of contents: %s',
                                          len(contents))

                             self.storage.content_add(
                                 [content_for_storage(x) for x in contents])

                             status_load = 'eventful'
                             directories = list(
                                 objects['directory'].values())
                             logger.debug('Number of directories: %s',
                                          len(directories))
                             self.storage.directory_add(directories)

                             # FIXME: This should be release. cf. D409
                             revision = self.build_revision(
                                 p_info['raw'], uncompressed_path)
                             revision.update({
                                 'synthetic': True,
                                 'directory': directory.hash,
                             })
                             revision['metadata'].update({
                                 'original_artifact': [
                                     hashes for _, hashes in dl_artifacts
                                 ],
                             })

                             revision['id'] = revision_id = \
                                 identifier_to_bytes(
                                     revision_identifier(revision))
                             logger.debug('Revision: %s', revision)
                             self.storage.revision_add([revision])

                     tmp_revisions[version].append((branch_name, revision_id))

             logger.debug('tmp_revisions: %s', tmp_revisions)
             # Build and load the snapshot
             branches = {}  # type: Dict[bytes, Mapping[str, Any]]
             for version, branch_name_revisions in tmp_revisions.items():
                 if version == default_version and \
                         len(branch_name_revisions) == 1:
                     # only 1 branch (no ambiguity), we can create an alias
                     # branch 'HEAD'
                     branch_name, _ = branch_name_revisions[0]
                     # except for some corner case (deposit)
                     if branch_name != 'HEAD':
                         branches[b'HEAD'] = {
                             'target_type': 'alias',
                             'target': branch_name.encode('utf-8'),
                         }

                 for branch_name, target in branch_name_revisions:
                     branches[branch_name.encode('utf-8')] = {
                         'target_type': 'revision',
                         'target': target,
                     }

             snapshot = {
                 'branches': branches
             }
             logger.debug('snapshot: %s', snapshot)

             snapshot['id'] = identifier_to_bytes(
                 snapshot_identifier(snapshot))

             logger.debug('snapshot: %s', snapshot)
             self.storage.snapshot_add([snapshot])

             if hasattr(self.storage, 'flush'):
                 self.storage.flush()
         except Exception:
             logger.exception('Failed to load %s' % self.url)
             status_visit = 'partial'
             status_load = 'failed'
         finally:
             self.storage.origin_visit_update(
                 origin=self.url, visit_id=visit_id,
                 status=status_visit,
                 snapshot=snapshot and snapshot['id'])
             result = {
                 'status': status_load,
             }  # type: Dict[str, Any]
             if snapshot:
-                result['snapshot_id'] = snapshot['id']
+                result['snapshot_id'] = hash_to_hex(snapshot['id'])
             return result
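
The two hash conversions are the crux of this patch: load() now hands back a hex snapshot id, and DepositLoader.load() converts it back to bytes before querying storage. A minimal round-trip sketch using only swh.model.hashutil (the hex value is the snapshot id from the deposit test below):

    from swh.model.hashutil import hash_to_bytes, hash_to_hex

    snapshot_id = '453f455d0efb69586143cd6b6e5897f9906b53a7'
    # bytes form for storage lookups, hex form for the loader's return value
    assert hash_to_hex(hash_to_bytes(snapshot_id)) == snapshot_id
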
diff --git a/swh/loader/package/tests/test_debian.py b/swh/loader/package/tests/test_debian.py
index 67ba46f..0c6d39d 100644
--- a/swh/loader/package/tests/test_debian.py
+++ b/swh/loader/package/tests/test_debian.py
@@ -1,374 +1,388 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import copy
 import logging
 import pytest

 from os import path

 from swh.loader.package.debian import (
     DebianLoader, download_package, dsc_information, uid_to_person,
     prepare_person, get_package_metadata, extract_package
 )
 from swh.loader.package.tests.common import check_snapshot, get_stats

 logger = logging.getLogger(__name__)

 PACKAGE_FILES = {
     'name': 'cicero',
     'version': '0.7.2-3',
     'files': {
         'cicero_0.7.2-3.diff.gz': {
             'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce',
             'name': 'cicero_0.7.2-3.diff.gz',
             'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c',  # noqa
             'size': 3964,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-3.diff.gz'  # noqa
         },
         'cicero_0.7.2-3.dsc': {
             'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a',
             'name': 'cicero_0.7.2-3.dsc',
             'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03',  # noqa
             'size': 1864,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-3.dsc'},  # noqa
         'cicero_0.7.2.orig.tar.gz': {
             'md5sum': '4353dede07c5728319ba7f5595a7230a',
             'name': 'cicero_0.7.2.orig.tar.gz',
             'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786',  # noqa
             'size': 96527,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz'  # noqa
         }
     },
 }

 PACKAGE_FILES2 = {
     'name': 'cicero',
     'version': '0.7.2-4',
     'files': {
         'cicero_0.7.2-4.diff.gz': {
             'md5sum': '1e7e6fc4a59d57c98082a3af78145734',
             'name': 'cicero_0.7.2-4.diff.gz',
             'sha256': '2e6fa296ee7005473ff58d0971f4fd325617b445671480e9f2cfb738d5dbcd01',  # noqa
             'size': 4038,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-4.diff.gz'  # noqa
         },
         'cicero_0.7.2-4.dsc': {
             'md5sum': '1a6c8855a73b4282bb31d15518f18cde',
             'name': 'cicero_0.7.2-4.dsc',
             'sha256': '913ee52f7093913420de5cbe95d63cfa817f1a1daf997961149501894e754f8b',  # noqa
             'size': 1881,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-4.dsc'},  # noqa
         'cicero_0.7.2.orig.tar.gz': {
             'md5sum': '4353dede07c5728319ba7f5595a7230a',
             'name': 'cicero_0.7.2.orig.tar.gz',
             'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786',  # noqa
             'size': 96527,
             'uri': 'http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz'  # noqa
         }
     }
 }

 PACKAGE_PER_VERSION = {
     'stretch/contrib/0.7.2-3': PACKAGE_FILES,
 }

 PACKAGES_PER_VERSION = {
     'stretch/contrib/0.7.2-3': PACKAGE_FILES,
     'buster/contrib/0.7.2-4': PACKAGE_FILES2,
 }


 def test_debian_first_visit(
         swh_config, requests_mock_datadir):
     """With no prior visit, load a debian project ends up with 1 snapshot

     """
     loader = DebianLoader(
         url='deb://Debian/packages/cicero',
         date='2019-10-12T05:58:09.165557+00:00',
         packages=PACKAGE_PER_VERSION)

     actual_load_status = loader.load()
-    assert actual_load_status['status'] == 'eventful'
+    expected_snapshot_id = '3b6b66e6ee4e7d903a379a882684a2a50480c0b4'
+    assert actual_load_status == {
+        'status': 'eventful',
+        'snapshot_id': expected_snapshot_id
+    }

     stats = get_stats(loader.storage)
     assert {
         'content': 42,
         'directory': 2,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,  # all artifacts under 1 revision
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     expected_snapshot = {
-        'id': '3b6b66e6ee4e7d903a379a882684a2a50480c0b4',
+        'id': expected_snapshot_id,
         'branches': {
             'releases/stretch/contrib/0.7.2-3': {
                 'target_type': 'revision',
                 'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
             }
         },
     }  # different than the previous loader as no release is done

     check_snapshot(expected_snapshot, loader.storage)
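 # The load-status assertions rewritten in this patch share one shape; a
 # small helper could factor them (a sketch, assuming load() always returns
 # a hex 'snapshot_id' alongside a non-failed 'status' as introduced here;
 # 'assert_load_ok' is a hypothetical name, not part of the test suite):
 #
 #     def assert_load_ok(actual_load_status, expected_snapshot_id,
 #                        status='eventful'):
 #         assert actual_load_status == {
 #             'status': status,
 #             'snapshot_id': expected_snapshot_id,
 #         }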
 def test_debian_first_visit_then_another_visit(
         swh_config, requests_mock_datadir):
-    """With no prior visit, load a gnu project ends up with 1 snapshot
+    """With no prior visit, load a debian project ends up with 1 snapshot

     """
     url = 'deb://Debian/packages/cicero'
     loader = DebianLoader(
         url=url,
         date='2019-10-12T05:58:09.165557+00:00',
         packages=PACKAGE_PER_VERSION)

     actual_load_status = loader.load()
-    assert actual_load_status['status'] == 'eventful'
+
+    expected_snapshot_id = '3b6b66e6ee4e7d903a379a882684a2a50480c0b4'
+    assert actual_load_status == {
+        'status': 'eventful',
+        'snapshot_id': expected_snapshot_id
+    }
+
     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'deb'

     stats = get_stats(loader.storage)
     assert {
         'content': 42,
         'directory': 2,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,  # all artifacts under 1 revision
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     expected_snapshot = {
-        'id': '3b6b66e6ee4e7d903a379a882684a2a50480c0b4',
+        'id': expected_snapshot_id,
         'branches': {
             'releases/stretch/contrib/0.7.2-3': {
                 'target_type': 'revision',
                 'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
             }
         },
     }  # different than the previous loader as no release is done

     check_snapshot(expected_snapshot, loader.storage)

     # No change in between load
     actual_load_status2 = loader.load()
     assert actual_load_status2['status'] == 'uneventful'
     origin_visit2 = list(loader.storage.origin_visit_get(url))
     assert origin_visit2[-1]['status'] == 'full'
     assert origin_visit2[-1]['type'] == 'deb'

     stats2 = get_stats(loader.storage)
     assert {
         'content': 42 + 0,
         'directory': 2 + 0,
         'origin': 1,
         'origin_visit': 1 + 1,  # a new visit occurred
         'person': 1,
         'release': 0,
         'revision': 1,
         'skipped_content': 0,
         'snapshot': 1,  # same snapshot across 2 visits
     } == stats2

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('http://deb.debian.org')
     ]
     # visited each package artifact twice across 2 visits
     assert len(urls) == len(set(urls))


 def test_uid_to_person():
     uid = 'Someone Name <someone@orga.org>'
     actual_person = uid_to_person(uid)

     assert actual_person == {
         'name': 'Someone Name',
         'email': 'someone@orga.org',
         'fullname': uid,
     }


 def test_prepare_person():
     actual_author = prepare_person({
         'name': 'Someone Name',
         'email': 'someone@orga.org',
         'fullname': 'Someone Name <someone@orga.org>',
     })

     assert actual_author == {
         'name': b'Someone Name',
         'email': b'someone@orga.org',
         'fullname': b'Someone Name <someone@orga.org>',
     }


 def test_download_package(datadir, tmpdir, requests_mock_datadir):
     tmpdir = str(tmpdir)  # py3.5 work around (LocalPath issue)
     all_hashes = download_package(PACKAGE_FILES, tmpdir)
     assert all_hashes == {
         'cicero_0.7.2-3.diff.gz': {
             'checksums': {
                 'blake2s256': '08b1c438e70d2474bab843d826515147fa4a817f8c4baaf3ddfbeb5132183f21',  # noqa
                 'sha1': '0815282053f21601b0ec4adf7a8fe47eace3c0bc',
                 'sha1_git': '834ac91da3a9da8f23f47004bb456dd5bd16fe49',
                 'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c'  # noqa
             },
             'filename': 'cicero_0.7.2-3.diff.gz',
             'length': 3964},
         'cicero_0.7.2-3.dsc': {
             'checksums': {
                 'blake2s256': '8c002bead3e35818eaa9d00826f3d141345707c58fb073beaa8abecf4bde45d2',  # noqa
                 'sha1': 'abbec4e8efbbc80278236e1dd136831eac08accd',
                 'sha1_git': '1f94b2086fa1142c2df6b94092f5c5fa11093a8e',
                 'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03'  # noqa
             },
             'filename': 'cicero_0.7.2-3.dsc',
             'length': 1864},
         'cicero_0.7.2.orig.tar.gz': {
             'checksums': {
                 'blake2s256': '9809aa8d2e2dad7f34cef72883db42b0456ab7c8f1418a636eebd30ab71a15a6',  # noqa
                 'sha1': 'a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43',
                 'sha1_git': 'aa0a38978dce86d531b5b0299b4a616b95c64c74',
                 'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786'  # noqa
             },
             'filename': 'cicero_0.7.2.orig.tar.gz',
             'length': 96527
         }
     }


 def test_dsc_information_ok():
     fname = 'cicero_0.7.2-3.dsc'
     dsc_url, dsc_name = dsc_information(PACKAGE_FILES)

     assert dsc_url == PACKAGE_FILES['files'][fname]['uri']
     assert dsc_name == PACKAGE_FILES['files'][fname]['name']


 def test_dsc_information_not_found():
     fname = 'cicero_0.7.2-3.dsc'
     package_files = copy.deepcopy(PACKAGE_FILES)
     package_files['files'].pop(fname)

     dsc_url, dsc_name = dsc_information(package_files)

     assert dsc_url is None
     assert dsc_name is None


 def test_dsc_information_too_many_dsc_entries():
     # craft an extra dsc file
     fname = 'cicero_0.7.2-3.dsc'
     package_files = copy.deepcopy(PACKAGE_FILES)
     data = package_files['files'][fname]
     fname2 = fname.replace('cicero', 'ciceroo')
     package_files['files'][fname2] = data

     with pytest.raises(
             ValueError, match='Package %s_%s references several dsc' % (
                 package_files['name'], package_files['version'])):
         dsc_information(package_files)


 def test_get_package_metadata(requests_mock_datadir, datadir, tmp_path):
     tmp_path = str(tmp_path)  # py3.5 compat.
     package = PACKAGE_FILES

     logger.debug('package: %s', package)

     # download the packages
     all_hashes = download_package(package, tmp_path)

     # Retrieve information from package
     _, dsc_name = dsc_information(package)

     dl_artifacts = [(tmp_path, hashes) for hashes in all_hashes.values()]

     # Extract information from package
     extracted_path = extract_package(dl_artifacts, tmp_path)

     # Retrieve information on package
     dsc_path = path.join(path.dirname(extracted_path), dsc_name)
     actual_package_info = get_package_metadata(
         package, dsc_path, extracted_path)

     logger.debug('actual_package_info: %s', actual_package_info)

     assert actual_package_info == {
         'changelog': {
             'date': '2014-10-19T16:52:35+02:00',
             'history': [
                 ('cicero', '0.7.2-2'),
                 ('cicero', '0.7.2-1'),
                 ('cicero', '0.7-1')
             ],
             'person': {
                 'email': 'sthibault@debian.org',
                 'fullname': 'Samuel Thibault <sthibault@debian.org>',
                 'name': 'Samuel Thibault'
             }
         },
         'maintainers': [
             {
                 'email': 'debian-accessibility@lists.debian.org',
                 'fullname': 'Debian Accessibility Team '
                             '<debian-accessibility@lists.debian.org>',
                 'name': 'Debian Accessibility Team'
             },
             {
                 'email': 'sthibault@debian.org',
                 'fullname': 'Samuel Thibault <sthibault@debian.org>',
                 'name': 'Samuel Thibault'
             }
         ],
         'name': 'cicero',
         'version': '0.7.2-3'
     }


 def test_debian_multiple_packages(swh_config, requests_mock_datadir):
     url = 'deb://Debian/packages/cicero'
     loader = DebianLoader(
         url=url,
         date='2019-10-12T05:58:09.165557+00:00',
         packages=PACKAGES_PER_VERSION)

     actual_load_status = loader.load()
-    assert actual_load_status['status'] == 'eventful'
+    expected_snapshot_id = 'defc19021187f3727293121fcf6c5c82cb923604'
+    assert actual_load_status == {
+        'status': 'eventful',
+        'snapshot_id': expected_snapshot_id
+    }

     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'deb'

     expected_snapshot = {
-        'id': 'defc19021187f3727293121fcf6c5c82cb923604',
+        'id': expected_snapshot_id,
         'branches': {
             'releases/stretch/contrib/0.7.2-3': {
                 'target_type': 'revision',
                 'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
             },
             'releases/buster/contrib/0.7.2-4': {
                 'target_type': 'revision',
                 'target': '8224139c274c984147ef4b09aa0e462c55a10bd3',
             }
         },
     }

     check_snapshot(expected_snapshot, loader.storage)
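
A note on the person-parsing helpers exercised above: the 'Name <email>' splitting is simple enough to restate standalone. A minimal sketch (uid_to_person_sketch is a hypothetical stand-in; the real uid_to_person lives in swh.loader.package.debian):

    def uid_to_person_sketch(uid: str) -> dict:
        # split 'Name <email>' on the first '<'; fullname keeps the raw uid
        name, _, rest = uid.partition('<')
        return {
            'name': name.strip(),
            'email': rest.rstrip('>').strip(),
            'fullname': uid,
        }

    assert uid_to_person_sketch('Someone Name <someone@orga.org>') == {
        'name': 'Someone Name',
        'email': 'someone@orga.org',
        'fullname': 'Someone Name <someone@orga.org>',
    }
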
diff --git a/swh/loader/package/tests/test_deposit.py b/swh/loader/package/tests/test_deposit.py
index 8e580e9..bbba451 100644
--- a/swh/loader/package/tests/test_deposit.py
+++ b/swh/loader/package/tests/test_deposit.py
@@ -1,204 +1,210 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import re

 from swh.model.hashutil import hash_to_bytes
 from swh.loader.package.deposit import DepositLoader
 from swh.loader.package.tests.common import (
     check_snapshot, check_metadata_paths, get_stats
 )
 from swh.core.pytest_plugin import requests_mock_datadir_factory


 def test_deposit_init_ok(swh_config):
     url = 'some-url'
     deposit_id = 999
     loader = DepositLoader(url, deposit_id)  # Something that does not exist

     assert loader.url == url
     assert loader.archive_url == '/%s/raw/' % deposit_id
     assert loader.metadata_url == '/%s/meta/' % deposit_id
     assert loader.deposit_update_url == '/%s/update/' % deposit_id
     assert loader.client is not None


 def test_deposit_loading_failure_to_fetch_metadata(swh_config):
     """Error during the fetch of the deposit metadata ends up with a
        failed/partial visit

     """
     # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
     url = 'some-url'
     unknown_deposit_id = 666
     loader = DepositLoader(url, unknown_deposit_id)  # does not exist

     actual_load_status = loader.load()
-    assert actual_load_status['status'] == 'failed'
+    assert actual_load_status == {'status': 'failed'}

     stats = get_stats(loader.storage)

     assert {
         'content': 0,
         'directory': 0,
         'origin': 1,
         'origin_visit': 1,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 0,
     } == stats

     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'partial'


 requests_mock_datadir_missing_one = requests_mock_datadir_factory(
     ignore_urls=[
         'https://deposit.softwareheritage.org/1/private/666/raw/',
     ])


 def test_deposit_loading_failure_to_retrieve_1_artifact(
         swh_config, requests_mock_datadir_missing_one):
     """Deposit with missing artifact ends up with an uneventful/partial visit

     """
     # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
     url = 'some-url-2'
     deposit_id = 666
     loader = DepositLoader(url, deposit_id)

     assert loader.archive_url
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'uneventful'
+    assert actual_load_status['snapshot_id'] is not None

     stats = get_stats(loader.storage)
     assert {
         'content': 0,
         'directory': 0,
         'origin': 1,
         'origin_visit': 1,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats

     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'partial'


 def test_revision_metadata_structure(swh_config, requests_mock_datadir):
     # do not care for deposit update query
     requests_mock_datadir.put(re.compile('https'))

     url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
     deposit_id = 666
     loader = DepositLoader(url, deposit_id)

     assert loader.archive_url
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
+    assert actual_load_status['snapshot_id'] is not None
     expected_revision_id = hash_to_bytes(
         '9471c606239bccb1f269564c9ea114e1eeab9eb4')
     revision = list(loader.storage.revision_get([expected_revision_id]))[0]

     assert revision is not None

     check_metadata_paths(revision['metadata'], paths=[
         ('extrinsic.provider', str),
         ('extrinsic.when', str),
         ('extrinsic.raw', dict),
         ('original_artifact', list),
     ])

     for original_artifact in revision['metadata']['original_artifact']:
         check_metadata_paths(original_artifact, paths=[
             ('filename', str),
             ('length', int),
             ('checksums', dict),
         ])


 def test_deposit_loading_ok(swh_config, requests_mock_datadir):
     requests_mock_datadir.put(re.compile('https'))  # do not care for put

     url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
     deposit_id = 666
     loader = DepositLoader(url, deposit_id)

     assert loader.archive_url
     actual_load_status = loader.load()
-    assert actual_load_status['status'] == 'eventful'
+    expected_snapshot_id = '453f455d0efb69586143cd6b6e5897f9906b53a7'
+    assert actual_load_status == {
+        'status': 'eventful',
+        'snapshot_id': expected_snapshot_id,
+    }

     stats = get_stats(loader.storage)
     assert {
         'content': 303,
         'directory': 12,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats

     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'full'
     expected_branches = {
         'HEAD': {
             'target': '9471c606239bccb1f269564c9ea114e1eeab9eb4',
             'target_type': 'revision',
         },
     }

     expected_snapshot = {
-        'id': '453f455d0efb69586143cd6b6e5897f9906b53a7',
+        'id': expected_snapshot_id,
         'branches': expected_branches,
     }
     check_snapshot(expected_snapshot, storage=loader.storage)

     # check metadata

     tool = {
         "name": "swh-deposit",
         "version": "0.0.1",
         "configuration": {
             "sword_version": "2",
         }
     }

     tool = loader.storage.tool_get(tool)
     assert tool is not None
     assert tool['id'] is not None

     provider = {
         "provider_name": "hal",
         "provider_type": "deposit_client",
         "provider_url": "https://hal-test.archives-ouvertes.fr/",
         "metadata": None,
     }

     provider = loader.storage.metadata_provider_get_by(provider)
     assert provider is not None
     assert provider['id'] is not None

     metadata = list(loader.storage.origin_metadata_get_by(
         url, provider_type='deposit_client'))
     assert metadata is not None
     assert isinstance(metadata, list)
     assert len(metadata) == 1
     metadata0 = metadata[0]

     assert metadata0['provider_id'] == provider['id']
     assert metadata0['provider_type'] == 'deposit_client'
     assert metadata0['tool_id'] == tool['id']
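
check_snapshot, used throughout these tests, has to bridge the same hex/bytes gap as the loaders. A minimal restatement of what such a check does, assuming storage.snapshot_get keys branches by bytes and returns bytes targets (check_snapshot_sketch is a hypothetical stand-in for the helper in swh.loader.package.tests.common):

    from swh.model.hashutil import hash_to_bytes, hash_to_hex

    def check_snapshot_sketch(expected, storage):
        # expected carries hex ids; storage speaks bytes
        snapshot = storage.snapshot_get(hash_to_bytes(expected['id']))
        assert snapshot is not None
        for name, target in expected['branches'].items():
            branch = snapshot['branches'][name.encode('utf-8')]
            assert branch['target_type'] == target['target_type']
            assert hash_to_hex(branch['target']) == target['target']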