Page Menu | Home | Software Heritage

D8060.diff
No One | Temporary

D8060.diff

diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -44,6 +44,7 @@
directory: List[Dict]
origin: List[Dict]
origin_visit_status: List[Dict]
+ raw_extrinsic_metadata: List[Dict]
@contextmanager
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -16,13 +16,20 @@
TypeVar,
cast,
)
+from urllib.parse import urlparse
import sentry_sdk
from swh.core.config import merge_configs
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
-from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer
+from swh.indexer.indexer import (
+ BaseIndexer,
+ ContentIndexer,
+ DirectoryIndexer,
+ ObjectsDict,
+ OriginIndexer,
+)
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
@@ -31,13 +38,14 @@
from swh.indexer.storage.model import (
ContentMetadataRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from swh.model import hashutil
-from swh.model.model import Directory
+from swh.model.model import Directory, MetadataAuthorityType
from swh.model.model import ObjectType as ModelObjectType
-from swh.model.model import Origin, Sha1Git
-from swh.model.swhids import CoreSWHID, ObjectType
+from swh.model.model import Origin, RawExtrinsicMetadata, Sha1Git
+from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType
REVISION_GET_BATCH_SIZE = 10
RELEASE_GET_BATCH_SIZE = 10
@@ -59,6 +67,99 @@
yield from f(list(group))
+class ExtrinsicMetadataIndexer(
+ BaseIndexer[Sha1Git, RawExtrinsicMetadata, OriginExtrinsicMetadataRow]
+):
+ def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+ summary: Dict[str, Any] = {"status": "uneventful"}
+ try:
+ results = []
+ for item in objects.get("raw_extrinsic_metadata", []):
+ results.extend(
+ self.index(item["id"], data=RawExtrinsicMetadata.from_dict(item))
+ )
+ except Exception:
+ if not self.catch_exceptions:
+ raise
+ summary["status"] = "failed"
+ return summary
+
+ summary_persist = self.persist_index_computations(results)
+ self.results = results
+ if summary_persist:
+ for value in summary_persist.values():
+ if value > 0:
+ summary["status"] = "eventful"
+ summary.update(summary_persist)
+ return summary
+
+ def index(
+ self,
+ id: Sha1Git,
+ data: Optional[RawExtrinsicMetadata],
+ **kwargs,
+ ) -> List[OriginExtrinsicMetadataRow]:
+ if data is None:
+ raise NotImplementedError(
+ "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data"
+ )
+ if data.target.object_type != ExtendedObjectType.ORIGIN:
+ # other types are not supported yet
+ return []
+
+ if data.authority.type != MetadataAuthorityType.FORGE:
+ # metadata provided by a third party; don't trust it
+ # (technically this could be handled below, but we check it here
+ # to return early, sparing a translation and an origin lookup)
+ # TODO: add ways to define trusted authorities
+ return []
+
+ metadata_items = []
+ mappings = []
+ for (mapping_name, mapping) in MAPPINGS.items():
+ if data.format in mapping.extrinsic_metadata_formats():
+ metadata_item = mapping().translate(data.metadata)
+ if metadata_item is not None:
+ metadata_items.append(metadata_item)
+ mappings.append(mapping_name)
+
+ if not metadata_items:
+ # No mapping produced metadata for this format; ignore it
+ return []
+
+ # TODO: batch requests to origin_get_by_sha1()
+ origins = self.storage.origin_get_by_sha1([data.target.object_id])
+ try:
+ (origin,) = origins
+ if origin is None:
+ raise ValueError()
+ except ValueError:
+ raise ValueError(f"Unknown origin {data.target}") from None
+
+ if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc:
+ # authority and origin hosts differ: third-party metadata; don't trust it
+ # TODO: add ways to define trusted authorities
+ return []
+
+ metadata = merge_documents(metadata_items)
+
+ return [
+ OriginExtrinsicMetadataRow(
+ id=origin["url"],
+ indexer_configuration_id=self.tool["id"],
+ from_remd_id=data.id,
+ mappings=mappings,
+ metadata=metadata,
+ )
+ ]
+
+ def persist_index_computations(
+ self, results: List[OriginExtrinsicMetadataRow]
+ ) -> Dict[str, int]:
+ """Persist the results in storage."""
+ return self.idx_storage.origin_extrinsic_metadata_add(results)
+
+
class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
"""Content-level indexer
@@ -129,15 +230,7 @@
def persist_index_computations(
self, results: List[ContentMetadataRow]
) -> Dict[str, int]:
- """Persist the results in storage.
-
- Args:
- results: list of content_metadata, dict with the
- following keys:
- - id (bytes): content's identifier (sha1)
- - metadata (jsonb): detected metadata
-
- """
+ """Persist the results in storage."""
return self.idx_storage.content_metadata_add(results)
@@ -239,16 +332,7 @@
def persist_index_computations(
self, results: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
- """Persist the results in storage.
-
- Args:
- results: list of content_mimetype, dict with the
- following keys:
- - id (bytes): content's identifier (sha1)
- - mimetype (bytes): mimetype in bytes
- - encoding (bytes): encoding in bytes
-
- """
+ """Persist the results in storage."""
# TODO: add functions in storage to keep data in
# directory_intrinsic_metadata
return self.idx_storage.directory_intrinsic_metadata_add(results)
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -77,6 +77,7 @@
raise NotImplementedError(f"{cls.__name__}.extrinsic_metadata_formats")
def translate(self, file_content: bytes) -> Optional[Dict]:
+ """Translates intrinsic metadata from the content of a file."""
raise NotImplementedError(f"{self.__class__.__name__}.translate")
def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
--- a/swh/indexer/metadata_dictionary/ruby.py
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -6,7 +6,7 @@
import ast
import itertools
import re
-from typing import List
+from typing import List, Tuple
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
@@ -37,6 +37,11 @@
return [entry["sha1"]]
return []
+ @classmethod
+ def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
+ # this class is only used by intrinsic metadata mappings
+ return ()
+
def translate(self, raw_content):
try:
raw_content = raw_content.decode()
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -3,10 +3,31 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer
-from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow
+import datetime
+from unittest.mock import call
+
+import attr
+
+from swh.indexer.metadata import (
+ ContentMetadataIndexer,
+ DirectoryMetadataIndexer,
+ ExtrinsicMetadataIndexer,
+)
+from swh.indexer.storage.model import (
+ ContentMetadataRow,
+ DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
+)
from swh.indexer.tests.utils import DIRECTORY2
-from swh.model.model import Directory, DirectoryEntry
+from swh.model.model import (
+ Directory,
+ DirectoryEntry,
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ RawExtrinsicMetadata,
+)
+from swh.model.swhids import ExtendedObjectType, ExtendedSWHID
from .utils import (
BASE_TEST_CONFIG,
@@ -36,6 +57,24 @@
"tools": TRANSLATOR_TOOL,
}
+REMD = RawExtrinsicMetadata(
+ target=ExtendedSWHID(
+ object_type=ExtendedObjectType.ORIGIN,
+ object_id=b"\x01" * 20,
+ ),
+ discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
+ authority=MetadataAuthority(
+ type=MetadataAuthorityType.FORGE,
+ url="https://example.org/",
+ ),
+ fetcher=MetadataFetcher(
+ name="example-fetcher",
+ version="1.0.0",
+ ),
+ format="application/vnd.github.v3+json",
+ metadata=b'{"full_name": "test software"}',
+)
+
class TestMetadata:
"""
@@ -141,3 +180,91 @@
del result.tool["id"]
assert results == expected_results
+
+ def test_extrinsic_metadata_indexer_unknown_format(self, mocker):
+ """Should be ignored when the format is unknown"""
+ metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+ metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+
+ remd = attr.evolve(REMD, format="unknown format")
+
+ results = metadata_indexer.index(remd.id, data=remd)
+
+ assert metadata_indexer.storage.method_calls == []
+ assert results == []
+
+ def test_extrinsic_metadata_indexer_github(self, mocker):
+ """Nominal case, calling the mapping and storing the result"""
+ origin = "https://example.org/jdoe/myrepo"
+
+ metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+ metadata_indexer.catch_exceptions = False
+ metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+ metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
+
+ tool = metadata_indexer.idx_storage.indexer_configuration_get(
+ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
+ )
+ assert tool is not None
+
+ assert metadata_indexer.process_journal_objects(
+ {"raw_extrinsic_metadata": [REMD.to_dict()]}
+ ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1}
+
+ assert metadata_indexer.storage.method_calls == [
+ call.origin_get_by_sha1([b"\x01" * 20])
+ ]
+
+ results = list(
+ metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin])
+ )
+ assert results == [
+ OriginExtrinsicMetadataRow(
+ id="https://example.org/jdoe/myrepo",
+ tool={"id": tool["id"], **TRANSLATOR_TOOL},
+ metadata={
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "https://forgefed.org/ns#Repository",
+ "name": "test software",
+ },
+ from_remd_id=REMD.id,
+ mappings=["GitHubMapping"],
+ )
+ ]
+
+ def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker):
+ """Early abort on non-forge authorities"""
+ metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+ metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+
+ remd = attr.evolve(
+ REMD,
+ authority=attr.evolve(REMD.authority, type=MetadataAuthorityType.REGISTRY),
+ )
+
+ results = metadata_indexer.index(remd.id, data=remd)
+
+ assert metadata_indexer.storage.method_calls == []
+ assert results == []
+
+ def test_extrinsic_metadata_indexer_thirdparty_authority(self, mocker):
+ """Should be ignored when authority URL does not match the origin"""
+
+ origin = "https://different-domain.example.org/jdoe/myrepo"
+
+ metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+ metadata_indexer.catch_exceptions = False
+ metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+ metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
+
+ tool = metadata_indexer.idx_storage.indexer_configuration_get(
+ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
+ )
+ assert tool is not None
+
+ results = metadata_indexer.index(REMD.id, data=REMD)
+
+ assert metadata_indexer.storage.method_calls == [
+ call.origin_get_by_sha1([b"\x01" * 20])
+ ]
+ assert results == []

File Metadata

Mime Type
text/plain
Expires
Mon, Apr 14, 3:44 AM (14 h, 15 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217068

Event Timeline