diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@
   hooks:
   - id: codespell
     name: Check source code spelling
-    exclude: ^(swh/loader/package/.*[/]+tests/data/.*)$
+    exclude: ^(swh/loader/(package/.*[/]+|core/)tests/data/.*)$
     args: [-L crate]
     entry: codespell --ignore-words-list=iff
     stages: [commit]
diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -3,12 +3,14 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import base64
 import datetime
 import hashlib
 import logging
 import os
 import time
 from typing import Any, ContextManager, Dict, Iterable, List, Optional, Union
+from urllib.parse import urlparse
 
 import sentry_sdk
 
@@ -16,6 +18,7 @@
 from swh.core.statsd import Statsd
 from swh.loader.core.metadata_fetchers import CredentialsType, get_fetchers_for_lister
 from swh.loader.exception import NotFound
+from swh.loader.package.utils import get_url_body
 from swh.model.model import (
     BaseContent,
     Content,
@@ -29,8 +32,11 @@
     Sha1Git,
     SkippedContent,
     Snapshot,
+    SnapshotBranch,
+    TargetType,
 )
 from swh.storage import get_storage
+from swh.storage.algos.snapshot import snapshot_get_latest
 from swh.storage.interface import StorageInterface
 from swh.storage.utils import now
 
@@ -273,7 +279,7 @@
         """
         return True
 
-    def store_data(self):
+    def store_data(self) -> None:
         """Store fetched data in the database.
 
         Should call the :func:`maybe_load_xyz` methods, which handle the
@@ -640,3 +646,136 @@
         self.storage.snapshot_add([snapshot])
         self.flush()
         self.loaded_snapshot_id = snapshot.id
+
+
+class ContentLoader(BaseLoader):
+    """Basic loader for edge case content ingestion.
+
+    The "integrity" field is normalized information about the checksum used and the
+    corresponding base64-encoded hash value of the content.
+
+    The multiple "fallback" urls received for the same content are mirror urls so no
+    need to keep those. We only use them to fetch the actual content if the main origin
+    is no longer available.
+
+    The output snapshot is of the form:
+
+    .. code::
+
+       id: <bytes>
+       branches:
+         HEAD:
+           target_type: content
+           target: <content-id>
+
+    """
+
+    visit_type = "content"
+
+    def __init__(
+        self,
+        *args,
+        integrity: str,
+        fallback_urls: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.fallback_urls = fallback_urls or []
+        self.integrity: str = integrity
+        self.content: Optional[Content] = None
+        self.snapshot: Optional[Snapshot] = None
+        self.last_snapshot: Optional[Snapshot] = None
+
+    def prepare(self) -> None:
+        self.last_snapshot = snapshot_get_latest(self.storage, self.origin.url)
+
+    def fetch_data(self) -> bool:
+        """Retrieve the content file as a Content object.
+
+        The origin url is tried first, then each fallback url in the order received.
+        Only content whose checksum matches the integrity field is retained.
+
+        Raises:
+            ValueError: content was fetched but none matched the integrity field
+            NotFound: none of the urls could be fetched
+
+        """
+        # Do not let content from a previous load() leak into this one
+        self.content = None
+        # Ordered and deduplicated: the main origin must always be tried first
+        urls = list(dict.fromkeys([self.origin.url, *self.fallback_urls]))
+        # Determine the content checksum stored in the integrity field
+        # <hash-algo>-<base64-encoded-checksum>
+        # https://w3c.github.io/webappsec-subresource-integrity/#grammardef-hash-algo
+        # Split on the first "-" only so a "-" in the value cannot break unpacking
+        hash_algo, hash_value_b64 = self.integrity.split("-", 1)
+        expected_checksum = base64.decodebytes(hash_value_b64.encode())
+        mismatched_urls: List[str] = []
+        for url in urls:
+            url_ = urlparse(url)
+            self.log.debug(
+                "prepare; origin_url=%s fallback=%s scheme=%s path=%s",
+                self.origin.url,
+                url,
+                url_.scheme,
+                url_.path,
+            )
+            try:
+                data = get_url_body(url)
+            except NotFound:
+                continue
+            content = Content.from_data(data)
+
+            # Ensure content received matched the integrity field received
+            if content.get_hash(hash_algo) == expected_checksum:
+                # match, we have found our content to ingest, exit loop
+                self.content = content
+                break
+            # otherwise discard it so mismatching content is never ingested
+            mismatched_urls.append(url)
+
+        if self.content is None:
+            if mismatched_urls:
+                raise ValueError(
+                    f"Integrity check failed for {self.origin.url} "
+                    f"(mismatching content from {mismatched_urls})."
+                )
+            raise NotFound(f"Unknown origin {self.origin.url}.")
+
+        return False  # no more data to fetch
+
+    def process_data(self) -> bool:
+        """Build the snapshot out of the Content retrieved."""
+
+        assert self.content is not None
+        self.snapshot = Snapshot(
+            branches={
+                b"HEAD": SnapshotBranch(
+                    target=self.content.sha1_git,
+                    target_type=TargetType.CONTENT,
+                ),
+            }
+        )
+
+        return False  # no more data to process
+
+    def store_data(self) -> None:
+        """Store newly retrieved Content and Snapshot."""
+        assert self.content is not None
+        self.storage.content_add([self.content])
+        assert self.snapshot is not None
+        self.storage.snapshot_add([self.snapshot])
+        self.loaded_snapshot_id = self.snapshot.id
+
+    def visit_status(self):
+        return "full" if self.content and self.snapshot is not None else "partial"
+
+    def load_status(self) -> Dict[str, Any]:
+        return {
+            "status": "uneventful"
+            if self.last_snapshot == self.snapshot
+            else "eventful"
+        }
+
+    def cleanup(self) -> None:
+        self.log.debug("cleanup")
diff --git a/swh/loader/core/tests/data/https_common-lisp.net/project_asdf_archives_asdf-3.3.5.lisp b/swh/loader/core/tests/data/https_common-lisp.net/project_asdf_archives_asdf-3.3.5.lisp
new file mode 100644
--- /dev/null
+++ b/swh/loader/core/tests/data/https_common-lisp.net/project_asdf_archives_asdf-3.3.5.lisp
@@ -0,0 +1 @@
+(print "hello-world")
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -1,8 +1,9 @@
-# Copyright (C) 2018-2021 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import base64
 import datetime
 import hashlib
 import logging
@@ -15,6 +16,7 @@
     SENTRY_ORIGIN_URL_TAG_NAME,
     SENTRY_VISIT_TYPE_TAG_NAME,
     BaseLoader,
+    ContentLoader,
     DVCSLoader,
 )
 from swh.loader.core.metadata_fetchers import MetadataFetcherProtocol
@@ -306,10 +308,12 @@
     assert save_path == expected_save_path
 
 
-def _check_load_failure(caplog, loader, exc_class, exc_text, status="partial"):
+def _check_load_failure(
+    caplog, loader, exc_class, exc_text, status="partial", origin=ORIGIN
+):
     """Check whether a failed load properly logged its exception, and that the
     snapshot didn't get referenced in storage"""
-    assert isinstance(loader, DVCSLoader)  # was implicit so far
+    assert isinstance(loader, (DVCSLoader, ContentLoader))  # was implicit so far
     for record in caplog.records:
         if record.levelname != "ERROR":
             continue
@@ -319,11 +323,12 @@
         assert isinstance(exc, exc_class)
         assert exc_text in exc.args[0]
 
-    # Check that the get_snapshot operation would have succeeded
-    assert loader.get_snapshot() is not None
+    if isinstance(loader, DVCSLoader):
+        # Check that the get_snapshot operation would have succeeded
+        assert loader.get_snapshot() is not None
 
     # And confirm that the visit doesn't reference a snapshot
-    visit = assert_last_visit_matches(loader.storage, ORIGIN.url, status)
+    visit = assert_last_visit_matches(loader.storage, origin.url, status)
     if status != "partial":
         assert visit.snapshot is None
         # But that the snapshot didn't get loaded
@@ -503,3 +508,112 @@
     sentry_tags = sentry_events[0]["tags"]
     assert sentry_tags.get(SENTRY_ORIGIN_URL_TAG_NAME) == ORIGIN.url
     assert sentry_tags.get(SENTRY_VISIT_TYPE_TAG_NAME) == DummyLoader.visit_type
+
+
+CONTENT_MIRROR = "https://common-lisp.net"
+CONTENT_URL = f"{CONTENT_MIRROR}/project/asdf/archives/asdf-3.3.5.lisp"
+# Bytes served for CONTENT_URL (see the matching file in the tests data directory)
+CONTENT_DATA = b'(print "hello-world")\n'
+# The integrity field carries the base64 encoding of the raw digest, not of its
+# hexadecimal representation
+CONTENT_SHA256 = hashlib.sha256(CONTENT_DATA).digest()
+CONTENT_INTEGRITY = f"sha256-{base64.b64encode(CONTENT_SHA256).decode()}"
+
+
+def test_content_loader_missing_field(swh_storage):
+    origin = Origin(CONTENT_URL)
+    with pytest.raises(TypeError, match="missing"):
+        ContentLoader(swh_storage, origin.url)
+
+
+def test_content_loader_404(caplog, swh_storage, requests_mock_datadir):
+    unknown_origin = Origin(f"{CONTENT_MIRROR}/project/asdf/archives/unknown.lisp")
+    loader = ContentLoader(
+        swh_storage, unknown_origin.url, integrity="sha256-unusedfornow"
+    )
+    result = loader.load()
+
+    assert result == {"status": "uneventful"}
+
+    _check_load_failure(
+        caplog,
+        loader,
+        NotFound,
+        "Unknown origin",
+        status="not_found",
+        origin=unknown_origin,
+    )
+
+
+def test_content_loader_404_with_fallback(caplog, swh_storage, requests_mock_datadir):
+    unknown_origin = Origin(f"{CONTENT_MIRROR}/project/asdf/archives/unknown.lisp")
+    fallback_url_ko = f"{CONTENT_MIRROR}/project/asdf/archives/unknown2.lisp"
+    loader = ContentLoader(
+        swh_storage,
+        unknown_origin.url,
+        fallback_urls=[fallback_url_ko],
+        integrity="sha256-unusedfornow",
+    )
+    result = loader.load()
+
+    assert result == {"status": "uneventful"}
+
+    _check_load_failure(
+        caplog,
+        loader,
+        NotFound,
+        "Unknown origin",
+        status="not_found",
+        origin=unknown_origin,
+    )
+
+
+def test_content_loader_ok_with_fallback(caplog, swh_storage, requests_mock_datadir):
+    dead_origin = Origin(f"{CONTENT_MIRROR}/dead-origin-url")
+    fallback_url_ok = CONTENT_URL
+    fallback_url_ko = f"{CONTENT_MIRROR}/project/asdf/archives/unknown2.lisp"
+
+    loader = ContentLoader(
+        swh_storage,
+        dead_origin.url,
+        fallback_urls=[fallback_url_ok, fallback_url_ko],
+        integrity=CONTENT_INTEGRITY,
+    )
+    result = loader.load()
+
+    assert result == {"status": "eventful"}
+
+
+def test_content_loader_ok_simple(swh_storage, requests_mock_datadir):
+    origin = Origin(CONTENT_URL)
+    loader = ContentLoader(
+        swh_storage,
+        origin.url,
+        integrity=CONTENT_INTEGRITY,
+    )
+    result = loader.load()
+
+    assert result == {"status": "eventful"}
+
+    visit_status = assert_last_visit_matches(
+        swh_storage, origin.url, status="full", type="content"
+    )
+    assert visit_status.snapshot is not None
+
+    result2 = loader.load()
+
+    assert result2 == {"status": "uneventful"}
+
+
+def test_content_loader_hash_mismatch(swh_storage, requests_mock_datadir):
+    """Content not matching the integrity field must not be ingested"""
+    wrong_checksum = hashlib.sha256(b"something else").digest()
+    loader = ContentLoader(
+        swh_storage,
+        CONTENT_URL,
+        integrity=f"sha256-{base64.b64encode(wrong_checksum).decode()}",
+    )
+    result = loader.load()
+
+    assert result == {"status": "failed"}
+    assert loader.content is None