diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -18,6 +18,7 @@
     MinimalOriginDict,
     OriginDict,
     PagedResult,
+    get_expansion,
 )
 from swh.search.metrics import send_metric, timed
@@ -333,10 +334,11 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
-        programming_languages: List[str] = [],
-        licenses: List[str] = [],
+        programming_languages: Optional[List[str]] = None,
+        licenses: Optional[List[str]] = None,
+        keywords: Optional[List[str]] = None,
+        sort_by: Optional[List[str]] = None,
         page_token: Optional[str] = None,
-        sort_by: List[str] = [],
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
         query_clauses: List[Dict[str, Any]] = []
@@ -431,40 +433,46 @@
                 }
             }
         )
+        if keywords:
+            query_clauses.append(
+                {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "multi_match": {
+                                "query": " ".join(keywords),
+                                "fields": [
+                                    get_expansion("keywords", ".") + "^2",
+                                    get_expansion("descriptions", "."),
+                                    # "^2" boosts an origin's score by 2x
+                                    # if the queried keywords are
+                                    # found in its intrinsic_metadata.keywords
+                                ],
+                            }
+                        },
+                    }
+                }
+            )

-        if licenses or programming_languages:
+        intrinsic_metadata_filters: List[Dict[str, Dict]] = []

-            license_filters = []
+        if licenses:
+            license_filters: List[Dict[str, Any]] = []
             for license in licenses:
                 license_filters.append(
-                    {
-                        "match": {
-                            (
-                                "intrinsic_metadata" ".http://schema.org/license" ".@id"
-                            ): license
-                        }
-                    }
+                    {"match": {get_expansion("licenses", "."): license}}
                 )
+            intrinsic_metadata_filters.append({"bool": {"should": license_filters}})

-            language_filters = []
+        if programming_languages:
+            language_filters: List[Dict[str, Any]] = []
             for language in programming_languages:
                 language_filters.append(
-                    {
-                        "match": {
-                            (
-                                "intrinsic_metadata"
-                                ".http://schema.org/programmingLanguage"
-                                ".@value"
-                            ): language
-                        }
-                    }
+                    {"match": {get_expansion("programming_languages", "."): language}}
                 )
+            intrinsic_metadata_filters.append({"bool": {"should": language_filters}})

-            intrinsic_metadata_filters = [
-                {"bool": {"should": license_filters}},
-                {"bool": {"should": language_filters}},
-            ]
-
+        if intrinsic_metadata_filters:
             query_clauses.append(
                 {
                     "nested": {
@@ -488,14 +496,15 @@

         sorting_params = []

-        for field in sort_by:
-            order = "asc"
-            if field and field[0] == "-":
-                field = field[1:]
-                order = "desc"
+        if sort_by:
+            for field in sort_by:
+                order = "asc"
+                if field and field[0] == "-":
+                    field = field[1:]
+                    order = "desc"

-            if field in SORT_BY_OPTIONS:
-                sorting_params.append({field: order})
+                if field in SORT_BY_OPTIONS:
+                    sorting_params.append({field: order})

         sorting_params.extend(
             [{"_score": "desc"}, {"sha1": "asc"},]
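For reviewers: the `keywords` branch above builds a nested multi_match clause over the expanded JSON-LD field names. Here is a sketch of the clause produced for keywords=["web", "framework"], with the get_expansion() calls (helper added in swh/search/interface.py below) expanded by hand; "^2" is Elasticsearch's per-field boost syntax, so a match on the keywords field counts twice as much as one on the description:

    # Illustrative only: the clause origin_search() appends for
    # keywords=["web", "framework"].
    keywords_clause = {
        "nested": {
            "path": "intrinsic_metadata",
            "query": {
                "multi_match": {
                    "query": "web framework",
                    "fields": [
                        # get_expansion("keywords", ".") + "^2" -> boosted 2x
                        "intrinsic_metadata.http://schema.org/keywords.@value^2",
                        # get_expansion("descriptions", ".") -> normal weight
                        "intrinsic_metadata.http://schema.org/description.@value",
                    ],
                }
            },
        }
    }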
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -15,6 +15,7 @@
     MinimalOriginDict,
     OriginDict,
     PagedResult,
+    get_expansion,
 )

 _words_regexp = re.compile(r"\w+")
@@ -92,6 +93,10 @@
     return res


+def _tokenize(x):
+    return x.lower().replace(",", " ").split()
+
+
 def _get_sorting_key(origin, field):
     """Get value of the field from an origin for sorting origins.
@@ -104,6 +109,12 @@
         field = field[1:]
         reversed = True

+    if field == "score":
+        if reversed:
+            return -origin.get(field, 0)
+        else:
+            return origin.get(field, 0)
+
     datetime_max = datetime.max.replace(tzinfo=timezone.utc)
     if field in ["nb_visits"]:  # unlike other options, nb_visits is of type integer
@@ -240,15 +251,16 @@
         metadata_pattern: Optional[str] = None,
         with_visit: bool = False,
         visit_types: Optional[List[str]] = None,
-        page_token: Optional[str] = None,
         min_nb_visits: int = 0,
         min_last_visit_date: str = "",
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
-        programming_languages: List[str] = [],
-        licenses: List[str] = [],
-        sort_by: List[str] = [],
+        programming_languages: Optional[List[str]] = None,
+        licenses: Optional[List[str]] = None,
+        keywords: Optional[List[str]] = None,
+        sort_by: Optional[List[str]] = None,
+        page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
         hits: Iterator[Dict[str, Any]] = (
@@ -344,13 +356,9 @@
                 >= datetime.fromisoformat(min_last_release_date),
                 hits,
             )
+
         if licenses:
-            METADATA_LICENSES = [
-                "intrinsic_metadata",
-                "http://schema.org/license",
-                "@id",
-            ]
-            licenses = [license_keyword.lower() for license_keyword in licenses]
+            queried_licenses = [license_keyword.lower() for license_keyword in licenses]
             hits = filter(
                 lambda o: any(
                     # If any of the queried licenses are found, include the origin
@@ -358,19 +366,14 @@
                         # returns True if queried_license_keyword is found
                         # in any of the licenses of the origin
                         queried_license_keyword in origin_license
-                        for origin_license in _nested_get(o, METADATA_LICENSES)
+                        for origin_license in _nested_get(o, get_expansion("licenses"))
                     )
-                    for queried_license_keyword in licenses
+                    for queried_license_keyword in queried_licenses
                 ),
                 hits,
             )

         if programming_languages:
-            METADATA_PROGRAMMING_LANGS = [
-                "intrinsic_metadata",
-                "http://schema.org/programmingLanguage",
-                "@value",
-            ]
-            programming_languages = [
+            queried_programming_languages = [
                 lang_keyword.lower() for lang_keyword in programming_languages
             ]
             hits = filter(
@@ -380,12 +383,46 @@
                         # returns True if queried_lang_keyword is found
                         # in any of the langs of the origin
                         queried_lang_keyword in origin_lang
-                        for origin_lang in _nested_get(o, METADATA_PROGRAMMING_LANGS)
+                        for origin_lang in _nested_get(
+                            o, get_expansion("programming_languages")
+                        )
                     )
-                    for queried_lang_keyword in programming_languages
+                    for queried_lang_keyword in queried_programming_languages
                 ),
                 hits,
             )
+        if keywords:
+
+            if sort_by:
+                sort_by.append("-score")
+            else:
+                sort_by = ["-score"]
+
+            from copy import deepcopy
+
+            hits_list = deepcopy(list(hits))
+
+            for origin in hits_list:
+                origin_keywords = [
+                    _tokenize(keyword)
+                    for keyword in _nested_get(origin, get_expansion("keywords"))
+                ]
+                origin_descriptions = [
+                    _tokenize(description)
+                    for description in _nested_get(
+                        origin, get_expansion("descriptions")
+                    )
+                ]
+
+                for q_keyword in keywords:
+                    for origin_keyword_tokens in origin_keywords:
+                        if q_keyword in origin_keyword_tokens:
+                            origin["score"] = origin.get("score", 0) + 2
+                    for origin_description_tokens in origin_descriptions:
+                        if q_keyword in origin_description_tokens:
+                            origin["score"] = origin.get("score", 0) + 1
+
+            hits = (origin for origin in hits_list if origin.get("score", 0) > 0)

         if visit_types is not None:
             visit_types_set = set(visit_types)
@@ -394,9 +431,14 @@
                 hits,
             )

-        hits_list = sorted(
-            hits, key=lambda o: tuple(_get_sorting_key(o, field) for field in sort_by),
-        )
+        hits_list = list(hits)
+        if sort_by:
+            sort_by_list = list(sort_by)
+            hits_list.sort(
+                key=lambda o: tuple(
+                    _get_sorting_key(o, field) for field in sort_by_list
+                )
+            )

         start_at_index = int(page_token) if page_token else 0
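For reviewers: the in-memory backend approximates the Elasticsearch boost with plain integer scores: +2 when a queried keyword appears among an origin's tokenized keywords, +1 when it appears in a tokenized description, and only origins with a positive score are kept (then sorted by "-score"). A self-contained sketch of that arithmetic, re-declaring _tokenize locally for illustration:

    def _tokenize(x):
        return x.lower().replace(",", " ").split()

    # One origin's metadata, tokenized the same way origin_search() does it.
    origin_keywords = [_tokenize("django,backend,server,web,framework")]
    origin_descriptions = [_tokenize("Django is a backend framework for applications")]

    score = 0
    for q_keyword in ["web", "framework"]:
        if any(q_keyword in tokens for tokens in origin_keywords):
            score += 2  # keyword hit: the in-memory analogue of "^2"
        if any(q_keyword in tokens for tokens in origin_descriptions):
            score += 1  # description hit at normal weight
    assert score == 5  # "web": 2 + 0, "framework": 2 + 1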
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -22,6 +22,28 @@
 ]


+def get_expansion(field, sep=None):
+    METADATA_FIELDS = {
+        "licenses": ["intrinsic_metadata", "http://schema.org/license", "@id"],
+        "programming_languages": [
+            "intrinsic_metadata",
+            "http://schema.org/programmingLanguage",
+            "@value",
+        ],
+        "keywords": ["intrinsic_metadata", "http://schema.org/keywords", "@value",],
+        "descriptions": [
+            "intrinsic_metadata",
+            "http://schema.org/description",
+            "@value",
+        ],
+    }
+
+    if sep:
+        return sep.join(METADATA_FIELDS[field])
+
+    return METADATA_FIELDS[field]
+
+
 class MinimalOriginDict(TypedDict):
     """Mandatory keys of an :class:`OriginDict`"""
@@ -66,26 +88,28 @@
         metadata_pattern: Optional[str] = None,
         with_visit: bool = False,
         visit_types: Optional[List[str]] = None,
-        page_token: Optional[str] = None,
         min_nb_visits: int = 0,
         min_last_visit_date: str = "",
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
-        programming_languages: List[str] = [],
-        licenses: List[str] = [],
-        sort_by: List[str] = [],
+        programming_languages: Optional[List[str]] = None,
+        licenses: Optional[List[str]] = None,
+        keywords: Optional[List[str]] = None,
+        sort_by: Optional[List[str]] = None,
+        page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
         """Searches for origins matching the `url_pattern`.

         Args:
             url_pattern: Part of the URL to search for
+            metadata_pattern: Keywords to look for
+                (across all the fields of intrinsic_metadata)
             with_visit: Whether origins with no visit are to be filtered out
             visit_types: Only origins having any of the provided visit types
                 (e.g. git, svn, pypi) will be returned
-            page_token: Opaque value used for pagination
             min_nb_visits: Filter origins that have number of visits >=
                 the provided value
             min_last_visit_date: Filter origins that have
@@ -97,15 +121,18 @@
                 last_revision_date on or after the provided date (ISO format)
             min_last_release_date: Filter origins that have
                 last_release_date on or after the provided date (ISO format)
-            licenses: Filter origins with licenses present in the given list
-                (based on instrinsic_metadata)
             programming_languages: Filter origins with programming languages
                 present in the given list (based on intrinsic_metadata)
+            licenses: Filter origins with licenses present in the given list
+                (based on intrinsic_metadata)
+            keywords: Filter origins having description/keywords
+                (extracted from intrinsic_metadata) that match given values
             sort_by: Sort results based on a list of fields mentioned in SORT_BY_OPTIONS
                 (nb_visits, last_visit_date, last_eventful_visit_date,
                 last_revision_date, last_release_date).
                 Return results in descending order if "-" is present at the
                 beginning otherwise in ascending order.
+            page_token: Opaque value used for pagination
             limit: number of results to return

         Returns:
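For reviewers: get_expansion() is now the single source of truth for these metadata paths across both backends. Without a separator it returns the path as a list (what the in-memory _nested_get() expects); with one it returns a joined string (what Elasticsearch field names expect). Expected values, derived from the METADATA_FIELDS table above:

    from swh.search.interface import get_expansion

    assert get_expansion("licenses") == [
        "intrinsic_metadata",
        "http://schema.org/license",
        "@id",
    ]
    assert (
        get_expansion("keywords", ".")
        == "intrinsic_metadata.http://schema.org/keywords.@value"
    )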
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -409,6 +409,71 @@
             date_type="last_revision_date"
         )

+    def test_origin_keywords_search(self):
+        ORIGINS = [
+            {
+                "url": "http://foobar.1.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "description": "Django is a backend framework for applications",
+                    "keywords": "django,backend,server,web,framework",
+                },
+            },
+            {
+                "url": "http://foobar.2.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "description": "Native Android applications are fast",
+                    "keywords": "android,mobile,ui",
+                },
+            },
+            {
+                "url": "http://foobar.3.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "description": "React framework helps you build web applications",
+                    "keywords": "react,web,ui",
+                },
+            },
+        ]
+        self.search.origin_update(ORIGINS)
+        self.search.flush()
+
+        def _check_results(keywords, origin_indices, sorting=False):
+            page = self.search.origin_search(url_pattern="foobar", keywords=keywords)
+            results = [r["url"] for r in page.results]
+            if sorting:
+                assert sorted(results) == sorted(
+                    [ORIGINS[index]["url"] for index in origin_indices]
+                )
+            else:
+                assert results == [ORIGINS[index]["url"] for index in origin_indices]
+
+        _check_results(["build"], [2])
+
+        _check_results(["web"], [2, 0])
+        _check_results(["ui"], [1, 2])
+
+        # Following tests ensure that boosts work properly
+
+        # Baseline: "applications" is common in all origin descriptions
+        _check_results(["applications"], [1, 0, 2], True)
+
+        # ORIGINS[0] has 'framework' in: keyword + description
+        # ORIGINS[2] has 'framework' in: description
+        # ORIGINS[1] has 'framework' in: None
+        _check_results(["framework", "applications"], [0, 2, 1])
+
+        # ORIGINS[1] has 'ui' in: keyword
+        # ORIGINS[2] has 'ui' in: keyword
+        # ORIGINS[0] has 'ui' in: None
+        _check_results(["applications", "ui"], [1, 2, 0])
+
+        # ORIGINS[2] has 'web' in: keyword + description
+        # ORIGINS[0] has 'web' in: keyword
+        # ORIGINS[1] has 'web' in: None
+        _check_results(["web", "applications"], [2, 0, 1])
+
     def test_origin_sort_by_search(self):
         now = datetime.now(tz=timezone.utc).isoformat()
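For reviewers: a caller-side sketch of the new parameter (hypothetical snippet; assumes a `search` backend instance already populated, as in the test above):

    page = search.origin_search(
        url_pattern="foobar",
        keywords=["web", "framework"],
        limit=10,
    )
    # Best matches come first: keyword-field hits outrank description-only
    # hits, with the usual {"sha1": "asc"} tie-break afterwards.
    urls = [r["url"] for r in page.results]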