Differential D4189 Diff 14784 swh/indexer/metadata.py

Changeset View

Standalone View

swh/indexer/metadata.py

Show All 17 Lines

from swh.core.config import merge_configs		from swh.core.config import merge_configs
from swh.core.utils import grouper		from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents		from swh.indexer.codemeta import merge_documents
from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer		from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer
from swh.indexer.metadata_detector import detect_metadata		from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS		from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.origin_head import OriginHeadIndexer		from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.storage import INDEXER_CFG_KEY		from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
from swh.indexer.storage.model import (		from swh.indexer.storage.model import (
ContentMetadataRow,		ContentMetadataRow,
OriginIntrinsicMetadataRow,		OriginIntrinsicMetadataRow,
RevisionIntrinsicMetadataRow,		RevisionIntrinsicMetadataRow,
)		)
from swh.model import hashutil		from swh.model import hashutil
from swh.model.model import Revision		from swh.model.model import Revision, Sha1Git

REVISION_GET_BATCH_SIZE = 10		REVISION_GET_BATCH_SIZE = 10
ORIGIN_GET_BATCH_SIZE = 10		ORIGIN_GET_BATCH_SIZE = 10


T1 = TypeVar("T1")		T1 = TypeVar("T1")
T2 = TypeVar("T2")		T2 = TypeVar("T2")


def call_with_batches(		def call_with_batches(
f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int,		f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int,
) -> Iterator[T2]:		) -> Iterator[T2]:
"""Calls a function with batches of args, and concatenates the results.		"""Calls a function with batches of args, and concatenates the results.
"""		"""
groups = grouper(args, batch_size)		groups = grouper(args, batch_size)
for group in groups:		for group in groups:
yield from f(list(group))		yield from f(list(group))


class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):		class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
		ardumont (Unsubmitted · Done · Inline Actions): For my understanding: the generic TId, TData and TResult are now formalized as, respectively, TId: Sha1; TData: bytes; TResult: ContentMetadataRow. TId and TData follow from ContentIndexer; TResult follows from this declaration. Did I get this right? .oO(That's neat) ardumont: For my understanding, so now the generic TId, TData and TResult are now formalized into…
		vlorentzAuthorUnsubmitted Done Inline Actions yes vlorentz: yes
"""Content-level indexer		"""Content-level indexer

This indexer is in charge of:		This indexer is in charge of:

- filtering out content already indexed in content_metadata		- filtering out content already indexed in content_metadata
- reading content from objstorage with the content's id sha1		- reading content from objstorage with the content's id sha1
- computing metadata by given context		- computing metadata by given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool		- using the metadata_dictionary as the 'swh-metadata-translator' tool
- store result in content_metadata table		- store result in content_metadata table

"""		"""

def filter(self, ids):		def filter(self, ids):
"""Filter out known sha1s and return only missing ones.		"""Filter out known sha1s and return only missing ones.
"""		"""
yield from self.idx_storage.content_metadata_missing(		yield from self.idx_storage.content_metadata_missing(
({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids)		({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids)
)		)

def index(		def index(
self, id, data: Optional[bytes] = None, log_suffix="unknown revision", **kwargs		self,
		id: Sha1,
		data: Optional[bytes] = None,
		log_suffix="unknown revision",
		**kwargs,
) -> List[ContentMetadataRow]:		) -> List[ContentMetadataRow]:
"""Index sha1s' content and store result.		"""Index sha1s' content and store result.

Args:		Args:
id (bytes): content's identifier		id (bytes): content's identifier
data (bytes): raw content in bytes		data (bytes): raw content in bytes

Returns:		Returns:
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	def filter(self, sha1_gits):
"""		"""
yield from self.idx_storage.revision_intrinsic_metadata_missing(		yield from self.idx_storage.revision_intrinsic_metadata_missing(
(		(
{"id": sha1_git, "indexer_configuration_id": self.tool["id"],}		{"id": sha1_git, "indexer_configuration_id": self.tool["id"],}
for sha1_git in sha1_gits		for sha1_git in sha1_gits
)		)
)		)

def index(self, id, data=None, **kwargs) -> List[RevisionIntrinsicMetadataRow]:		def index(
		self, id: Sha1Git, data: Optional[Revision], **kwargs
		) -> List[RevisionIntrinsicMetadataRow]:
"""Index rev by processing it and organizing result.		"""Index rev by processing it and organizing result.

use metadata_detector to iterate on filenames		use metadata_detector to iterate on filenames

- if one filename detected -> sends file to content indexer		- if one filename detected -> sends file to content indexer
- if multiple file detected -> translation needed at revision level		- if multiple file detected -> translation needed at revision level

Args:		Args:
rev: revision model object from storage		id: sha1_git of the revision
		data: revision model object from storage

Returns:		Returns:
dict: dictionary representing a revision_intrinsic_metadata, with		dict: dictionary representing a revision_intrinsic_metadata, with
keys:		keys:

- id (str): rev's identifier (sha1_git)		- id (str): rev's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used		- indexer_configuration_id (bytes): tool used
- metadata: dict of retrieved metadata		- metadata: dict of retrieved metadata

"""		"""
rev = id		rev = data
assert isinstance(rev, Revision)		assert isinstance(rev, Revision)
assert data is None

try:		try:
root_dir = rev.directory		root_dir = rev.directory
dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))		dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
if [entry["type"] for entry in dir_ls] == ["dir"]:		if [entry["type"] for entry in dir_ls] == ["dir"]:
# If the root is just a single directory, recurse into it		# If the root is just a single directory, recurse into it
# eg. PyPI packages, GNU tarballs		# eg. PyPI packages, GNU tarballs
subdir = dir_ls[0]["target"]		subdir = dir_ls[0]["target"]
▲ Show 20 Lines • Show All 142 Lines • ▼ Show 20 Lines	) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]:
assert len(head_revs) == len(head_rev_ids)		assert len(head_revs) == len(head_rev_ids)

results = []		results = []
for (origin, rev) in zip(origins_with_head, head_revs):		for (origin, rev) in zip(origins_with_head, head_revs):
if not rev:		if not rev:
self.log.warning("Missing head revision of origin %r", origin.url)		self.log.warning("Missing head revision of origin %r", origin.url)
continue		continue

for rev_metadata in self.revision_metadata_indexer.index(rev):		for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev):
# There is at most one rev_metadata		# There is at most one rev_metadata
orig_metadata = OriginIntrinsicMetadataRow(		orig_metadata = OriginIntrinsicMetadataRow(
from_revision=rev_metadata.id,		from_revision=rev_metadata.id,
id=origin.url,		id=origin.url,
metadata=rev_metadata.metadata,		metadata=rev_metadata.metadata,
mappings=rev_metadata.mappings,		mappings=rev_metadata.mappings,
indexer_configuration_id=rev_metadata.indexer_configuration_id,		indexer_configuration_id=rev_metadata.indexer_configuration_id,
)		)
▲ Show 20 Lines • Show All 72 Lines • Show Last 20 Lines