D2479.id8829.diff

diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py
--- a/swh/loader/package/debian/loader.py
+++ b/swh/loader/package/debian/loader.py
@@ -89,33 +89,11 @@
yield release_name(version), p_info
def resolve_revision_from(
- self, known_package_artifacts: Dict, artifact_metadata: Dict) \
+ self, known_package_artifacts: Mapping,
+ artifact_metadata: Mapping) \
-> Optional[bytes]:
- artifacts_to_fetch = artifact_metadata['files']
- logger.debug('k_p_artifacts: %s', known_package_artifacts)
- logger.debug('artifacts_to_fetch: %s', artifacts_to_fetch)
- for rev_id, known_artifacts in known_package_artifacts.items():
- logger.debug('Revision: %s', rev_id)
- logger.debug('Associated known_artifacts: %s', known_artifacts)
- known_artifacts = known_artifacts['extrinsic']['raw']['files']
- rev_found = True
- for a_name, k_artifact in known_artifacts.items():
- artifact_to_fetch = artifacts_to_fetch.get(a_name)
- logger.debug('artifact_to_fetch: %s', artifact_to_fetch)
- if artifact_to_fetch is None:
- # as soon as we do not see an artifact, we consider we need
- # to check the other revision
- rev_found = False
- if k_artifact['sha256'] != artifact_to_fetch['sha256']:
- # Hash is different, we consider we need to check the other
- # revisions
- rev_found = False
- if rev_found:
- logger.debug('Existing revision %s found for new artifacts.',
- rev_id)
- return rev_id
- logger.debug('No existing revision found for the new artifacts.')
- return None
+ return resolve_revision_from(
+ known_package_artifacts, artifact_metadata)
def download_package(self, p_info: Mapping[str, Any],
tmpdir: str) -> List[Tuple[str, Mapping]]:
@@ -183,6 +161,38 @@
}
+def resolve_revision_from(known_package_artifacts: Mapping,
+ artifact_metadata: Mapping) -> Optional[bytes]:
+    """Given known package artifacts (resolved from the snapshot of the
+    previous visit) and the metadata of the new artifact to fetch, try to
+    resolve the corresponding revision.
+
+ """
+ artifacts_to_fetch = artifact_metadata.get('files')
+ if not artifacts_to_fetch:
+ return None
+
+ def to_set(data):
+ return frozenset([
+ (name, meta['sha256'], meta['size'])
+ for name, meta in data['files'].items()
+ ])
+
+ # what we want to avoid downloading back if we have them already
+ set_new_artifacts = to_set(artifact_metadata)
+
+ known_artifacts_revision_id = {}
+ for rev_id, known_artifacts in known_package_artifacts.items():
+ extrinsic = known_artifacts.get('extrinsic')
+ if not extrinsic:
+ continue
+
+ s = to_set(extrinsic['raw'])
+ known_artifacts_revision_id[s] = rev_id
+
+ return known_artifacts_revision_id.get(set_new_artifacts)
+
+
def uid_to_person(uid: str) -> Mapping[str, str]:
"""Convert an uid to a person suitable for insertion.
diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py
--- a/swh/loader/package/debian/tests/test_debian.py
+++ b/swh/loader/package/debian/tests/test_debian.py
@@ -6,6 +6,7 @@
import copy
import logging
import pytest
+import random
from os import path
@@ -14,6 +15,7 @@
prepare_person, get_package_metadata, extract_package
)
from swh.loader.package.tests.common import check_snapshot, get_stats
+from swh.loader.package.debian.loader import resolve_revision_from
logger = logging.getLogger(__name__)
@@ -386,3 +388,81 @@
}
check_snapshot(expected_snapshot, loader.storage)
+
+
+def test_resolve_revision_from_edge_cases():
+    """Resolving a revision when either input is empty yields no revision
+
+ """
+ for package_artifacts in [{}, PACKAGE_FILES]:
+ actual_revision = resolve_revision_from(
+ package_artifacts, {})
+ assert actual_revision is None
+
+ for known_artifacts in [{}, PACKAGE_FILES]:
+ actual_revision = resolve_revision_from(
+ {}, known_artifacts)
+ assert actual_revision is None
+
+ known_package_artifacts = {
+ b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07": {
+ 'extrinsic': {
+ # empty
+ },
+ # ... removed the unnecessary intermediary data
+ }
+ }
+ assert not resolve_revision_from(known_package_artifacts, PACKAGE_FILES)
+
+
+def test_resolve_revision_from_edge_cases_hit_and_miss():
+    """Resolving a revision from mismatching artifacts yields no revision
+
+ """
+ artifact_metadata = PACKAGE_FILES2
+ expected_revision_id = b"(\x08\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xff\x85\x85O\xfe\xcf\x07" # noqa
+ known_package_artifacts = {
+ expected_revision_id: {
+ 'extrinsic': {
+ 'raw': PACKAGE_FILES,
+ },
+ # ... removed the unnecessary intermediary data
+ }
+ }
+
+ actual_revision = resolve_revision_from(
+ known_package_artifacts, artifact_metadata
+ )
+
+ assert actual_revision is None
+
+
+def test_resolve_revision_from():
+    """Resolving a revision from matching artifacts yields that revision
+
+ """
+ artifact_metadata = PACKAGE_FILES
+ expected_revision_id = b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07" # noqa
+
+ files = artifact_metadata['files']
+ # shuffling dict's keys
+ keys = list(files.keys())
+ random.shuffle(keys)
+ package_files = {
+ 'files': {k: files[k] for k in keys}
+ }
+
+ known_package_artifacts = {
+ expected_revision_id: {
+ 'extrinsic': {
+ 'raw': package_files,
+ },
+ # ... removed the unnecessary intermediary data
+ }
+ }
+
+ actual_revision = resolve_revision_from(
+ known_package_artifacts, artifact_metadata
+ )
+
+ assert actual_revision == expected_revision_id
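
For reference, a small standalone sketch of how the new module-level
resolve_revision_from helper behaves, assuming artifact dictionaries shaped
like the PACKAGE_FILES fixtures used in the tests; the file names, sizes and
sha256 values below are made up for illustration.

    from swh.loader.package.debian.loader import resolve_revision_from

    # Hypothetical artifact metadata for the upcoming visit; the 'files'
    # mapping mirrors the shape the loader keeps under extrinsic['raw'].
    artifact_metadata = {
        'files': {
            'foo_1.0-1.dsc': {'sha256': 'a' * 64, 'size': 1864},
            'foo_1.0-1.diff.gz': {'sha256': 'b' * 64, 'size': 21200},
            'foo_1.0.orig.tar.gz': {'sha256': 'c' * 64, 'size': 96527},
        }
    }

    # Artifacts already known from a previous visit, keyed by revision id.
    known_package_artifacts = {
        b'\x01' * 20: {
            'extrinsic': {'raw': artifact_metadata},
        },
    }

    # Same (name, sha256, size) triples: the previous revision id is
    # returned, so the loader can skip downloading the artifacts again.
    assert resolve_revision_from(
        known_package_artifacts, artifact_metadata) == b'\x01' * 20

    # A changed hash (or a missing/extra file) yields None: no known
    # revision matches, so a new revision will be built.
    changed = {
        'files': {
            **artifact_metadata['files'],
            'foo_1.0-1.dsc': {'sha256': 'd' * 64, 'size': 1864},
        }
    }
    assert resolve_revision_from(known_package_artifacts, changed) is None

Matching is done on frozensets of (name, sha256, size) triples, so the order
of the files does not matter, which is what test_resolve_revision_from above
exercises by shuffling the fixture's keys.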
