diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -165,6 +165,9 @@
                     "has_visits": {"type": "boolean",},
                     "nb_visits": {"type": "integer"},
                     "snapshot_id": {"type": "keyword"},
+                    "keywords": {"type": "keyword"},
+                    "description": {"type": "keyword"},
+                    # description may be an array: any ES field can hold several values
                     "last_visit_date": {"type": "date"},
                     "last_eventful_visit_date": {"type": "date"},
                     "last_release_date": {"type": "date"},
@@ -335,6 +338,7 @@
         min_last_release_date: str = "",
         programming_languages: List[str] = [],
         licenses: List[str] = [],
+        keywords: List[str] = [],
         page_token: Optional[str] = None,
         sort_by: List[str] = [],
         limit: int = 50,
@@ -431,6 +435,32 @@
                     }
                 }
             )
+        if keywords:
+            query_clauses.append(
+                {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "multi_match": {
+                                "query": " ".join(keywords),
+                                "fields": [
+                                    (
+                                        "intrinsic_metadata"
+                                        ".http://schema.org/keywords"
+                                        ".@value"
+                                        "^2"
+                                    ),
+                                    (
+                                        "intrinsic_metadata"
+                                        ".http://schema.org/description"
+                                        ".@value"
+                                    ),
+                                ],
+                            }
+                        },
+                    }
+                }
+            )
 
         if licenses or programming_languages:
 
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -104,6 +104,12 @@
         field = field[1:]
         reversed = True
 
+    if field == "score":
+        if reversed:
+            return -origin.get(field, 0)
+        else:
+            return origin.get(field, 0)
+
     datetime_max = datetime.max.replace(tzinfo=timezone.utc)
     if field in ["nb_visits"]:
         # unlike other options, nb_visits is of type integer
@@ -248,6 +254,7 @@
         min_last_release_date: str = "",
         programming_languages: List[str] = [],
         licenses: List[str] = [],
+        keywords: List[str] = [],
         sort_by: List[str] = [],
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
@@ -386,6 +393,45 @@
                 ),
                 hits,
             )
+        if keywords:
+            METADATA_KEYWORDS = [
+                "intrinsic_metadata",
+                "http://schema.org/keywords",
+                "@value",
+            ]
+            METADATA_DESCRIPTIONS = [
+                "intrinsic_metadata",
+                "http://schema.org/description",
+                "@value",
+            ]
+            sort_by = sort_by + ["-score"]  # rebind: never mutate the shared default
+
+            from copy import deepcopy
+
+            hits_list = deepcopy(list(hits))
+
+            def tokenize(x):
+                return x.lower().replace(",", " ").split()
+
+            for origin in hits_list:
+                origin_keywords = [
+                    tokenize(keyword)
+                    for keyword in _nested_get(origin, METADATA_KEYWORDS)
+                ]
+                origin_descriptions = [
+                    tokenize(description)
+                    for description in _nested_get(origin, METADATA_DESCRIPTIONS)
+                ]
+
+                for q_keyword in keywords:
+                    for origin_keyword_tokens in origin_keywords:
+                        if q_keyword in origin_keyword_tokens:
+                            origin["score"] = origin.get("score", 0) + 2
+                    for origin_description_tokens in origin_descriptions:
+                        if q_keyword in origin_description_tokens:
+                            origin["score"] = origin.get("score", 0) + 1
+
+            hits = (origin for origin in hits_list if origin.get("score", 0) > 0)
 
         if visit_types is not None:
             visit_types_set = set(visit_types)
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -74,6 +74,7 @@
         min_last_release_date: str = "",
         programming_languages: List[str] = [],
         licenses: List[str] = [],
+        keywords: List[str] = [],
         sort_by: List[str] = [],
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
@@ -101,6 +102,8 @@
                 (based on instrinsic_metadata)
             programming_languages: Filter origins with programming languages
                 present in the given list (based on instrinsic_metadata)
+            keywords: Filter origins having description/keywords
+                (extracted from intrinsic_metadata) that match given values
             sort_by: Sort results based on a list of fields
                 mentioned in SORT_BY_OPTIONS (nb_visits,last_visit_date,
                 last_eventful_visit_date, last_revision_date, last_release_date).
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -409,6 +409,59 @@
             date_type="last_revision_date"
         )
 
+    def test_origin_keywords_search(self):
+        ORIGINS = [
+            {
+                "url": "http://foobar.1.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "description": "Django is a backend framework for applications",
+                    "keywords": "django,backend,server,web",
+                },
+            },
+            {
+                "url": "http://foobar.2.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "description": "Native Android applications are fast",
+                    "keywords": "android,mobile,ui",
+                },
+            },
+            {
+                "url": "http://foobar.3.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "description": "React helps you build powerful web applications",
+                    "keywords": "react,web,ui",
+                },
+            },
+        ]
+        self.search.origin_update(ORIGINS)
+        self.search.flush()
+
+        def _check_results(keywords, origin_indices):
+            page = self.search.origin_search(url_pattern="foobar", keywords=keywords)
+            results = [r["url"] for r in page.results]
+            assert results == [ORIGINS[index]["url"] for index in origin_indices]
+
+        _check_results(["powerful"], [2])
+
+        _check_results(["web"], [2, 0])
+        _check_results(["ui"], [1, 2])
+
+        # baseline: "applications" is common in all origins
+        _check_results(["applications"], [2, 1, 0])
+        # ORIGINS[1] has 'android' in both description and keyword so boosted
+        _check_results(["android", "applications"], [1, 2, 0])
+        # 'ui' is present in keywords of both origins so they're boosted
+        _check_results(["ui", "applications"], [1, 2, 0])
+        # difference from baseline:
+        # ORIGINS[2] has 'web' in: keyword + description
+        # ORIGINS[0] has 'web' in: keyword
+        # ORIGINS[1] has 'web' in: None
+        # so they get boosted accordingly
+        _check_results(["web", "applications"], [2, 0, 1])
+
     def test_origin_sort_by_search(self):
 
         now = datetime.now(tz=timezone.utc).isoformat()