Changeset View
Changeset View
Standalone View
Standalone View
swh/web/utils/archive.py
Show First 20 Lines • Show All 330 Lines • ▼ Show 20 Lines | else: | ||||
regexp=True, | regexp=True, | ||||
) | ) | ||||
origins = [converters.from_origin(ori.to_dict()) for ori in page_result.results] | origins = [converters.from_origin(ori.to_dict()) for ori in page_result.results] | ||||
return (origins, page_result.next_page_token) | return (origins, page_result.next_page_token) | ||||
def search_origin_metadata( | def search_origin_metadata( | ||||
fulltext: str, limit: int = 50 | fulltext: str, limit: int = 50, return_metadata: bool = True | ||||
) -> Iterable[OriginMetadataInfo]: | ) -> Iterable[OriginMetadataInfo]: | ||||
"""Search for origins whose metadata match a provided string pattern. | """Search for origins whose metadata match a provided string pattern. | ||||
Args: | Args: | ||||
fulltext: the string pattern to search for in origin metadata | fulltext: the string pattern to search for in origin metadata | ||||
limit: the maximum number of found origins to return | limit: the maximum number of found origins to return | ||||
return_metadata: if False, omit each origin's metadata and return only its URL | |||||
Returns: | Returns: | ||||
Iterable of origin metadata information for existing origins | Iterable of origin metadata information for existing origins | ||||
""" | """ | ||||
results = [] | results = [] | ||||
if ( | if ( | ||||
search | search | ||||
and config.get_config()["search_config"]["metadata_backend"] == "swh-search" | and config.get_config()["search_config"]["metadata_backend"] == "swh-search" | ||||
): | ): | ||||
page_result = search.origin_search( | page_result = search.origin_search( | ||||
metadata_pattern=fulltext, | metadata_pattern=fulltext, | ||||
limit=limit, | limit=limit, | ||||
) | ) | ||||
origin_urls = [r["url"] for r in page_result.results] | origin_urls = [r["url"] for r in page_result.results] | ||||
if return_metadata: | |||||
metadata = { | metadata = { | ||||
r.id: r for r in idx_storage.origin_intrinsic_metadata_get(origin_urls) | r.id: r for r in idx_storage.origin_intrinsic_metadata_get(origin_urls) | ||||
} | } | ||||
else: | |||||
# Skip the query to swh-indexer when the caller did not ask for metadata | |||||
metadata = {} | |||||
# Results from swh-search are not guaranteed to be in | # Results from swh-search are not guaranteed to be in | ||||
# idx_storage.origin_intrinsic_metadata (typically when they come from | # idx_storage.origin_intrinsic_metadata (typically when they come from | ||||
# extrinsic metadata; or when the swh-indexer cache is cleared). | # extrinsic metadata; or when the swh-indexer cache is cleared). | ||||
# When they are missing, we only return the origin url. | # When they are missing, we only return the origin url. | ||||
matches = [ | matches = [ | ||||
metadata[url].to_dict() if url in metadata else {"id": url} | metadata[url].to_dict() if url in metadata else {"id": url} | ||||
for url in origin_urls | for url in origin_urls | ||||
Show All 15 Lines | for origin, match in zip(origins, matches): | ||||
# This may occur when the storage database we use is lagging behind | # This may occur when the storage database we use is lagging behind | ||||
# swh-search | # swh-search | ||||
continue | continue | ||||
for field in ("from_directory", "from_revision"): | for field in ("from_directory", "from_revision"): | ||||
# from_directory when using swh.indexer >= 2.0.0, from_revision otherwise | # from_directory when using swh.indexer >= 2.0.0, from_revision otherwise | ||||
if field in match: | if field in match: | ||||
match[field] = hashutil.hash_to_hex(match[field]) | match[field] = hashutil.hash_to_hex(match[field]) | ||||
del match["id"] | del match["id"] | ||||
if not return_metadata: | |||||
match.pop("metadata", None) | |||||
results.append(OriginMetadataInfo(url=origin.url, metadata=match)) | results.append(OriginMetadataInfo(url=origin.url, metadata=match)) | ||||
return results | return results | ||||
def lookup_origin_intrinsic_metadata(origin_url: str) -> Dict[str, Any]: | def lookup_origin_intrinsic_metadata(origin_url: str) -> Dict[str, Any]: | ||||
"""Return intrinsic metadata for origin whose origin matches given | """Return intrinsic metadata for origin whose origin matches given | ||||
origin. | origin. | ||||
▲ Show 20 Lines • Show All 1,050 Lines • Show Last 20 Lines |