diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -18,9 +18,9 @@
     MinimalOriginDict,
     OriginDict,
     PagedResult,
-    get_expansion,
 )
 from swh.search.metrics import send_metric, timed
+from swh.search.utils import get_expansion

 INDEX_NAME_PARAM = "index"
 READ_ALIAS_PARAM = "read_alias"
@@ -72,17 +72,13 @@


 def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str:
-    """Tokenize as string an index page result from a search
-
-    """
+    """Tokenize as string an index page result from a search"""
     page_token = base64.b64encode(msgpack.dumps(index_to_tokenize))
     return page_token.decode()


 def token_decode(page_token: str) -> Dict[bytes, Any]:
-    """Read the page_token
-
-    """
+    """Read the page_token"""
     return msgpack.loads(base64.b64decode(page_token.encode()), raw=True)


@@ -177,7 +173,20 @@
                             # don't bother indexing tokens in these URIs, as the
                             # are used as namespaces
                             "type": "keyword",
-                        }
+                        },
+                        # "http://schema": {
+                        #     "properties": {
+                        #         "org/dateCreated": {
+                        #             "properties": {"@value": {"type": "date"}}
+                        #         },
+                        #         "org/dateModified": {
+                        #             "properties": {"@value": {"type": "date"}}
+                        #         },
+                        #         "org/datePublished": {
+                        #             "properties": {"@value": {"type": "date"}}
+                        #         },
+                        #     }
+                        # },
                     },
                 },
                 # Has this origin been taken down?
@@ -334,6 +343,9 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
+        min_date_created: str = "",
+        min_date_modified: str = "",
+        min_date_published: str = "",
         programming_languages: Optional[List[str]] = None,
         licenses: Optional[List[str]] = None,
         keywords: Optional[List[str]] = None,
@@ -472,6 +484,33 @@
             )
             intrinsic_metadata_filters.append({"bool": {"should": language_filters}})

+        if min_date_created:
+            intrinsic_metadata_filters.append(
+                {
+                    "range": {
+                        get_expansion("date_created", "."): {"gte": min_date_created,}
+                    }
+                }
+            )
+        if min_date_modified:
+            intrinsic_metadata_filters.append(
+                {
+                    "range": {
+                        get_expansion("date_modified", "."): {"gte": min_date_modified,}
+                    }
+                }
+            )
+        if min_date_published:
+            intrinsic_metadata_filters.append(
+                {
+                    "range": {
+                        get_expansion("date_published", "."): {
+                            "gte": min_date_published,
+                        }
+                    }
+                }
+            )
+
         if intrinsic_metadata_filters:
             query_clauses.append(
                 {
@@ -494,7 +533,7 @@
         if visit_types is not None:
             query_clauses.append({"terms": {"visit_types": visit_types}})

-        sorting_params = []
+        sorting_params: List[Dict[str, Any]] = []

         if sort_by:
             for field in sort_by:
@@ -503,7 +542,16 @@
                     field = field[1:]
                     order = "desc"

-                if field in SORT_BY_OPTIONS:
+                if field in ["date_created", "date_modified", "date_published"]:
+                    sorting_params.append(
+                        {
+                            get_expansion(field, "."): {
+                                "nested_path": "intrinsic_metadata",
+                                "order": order,
+                            }
+                        }
+                    )
+                elif field in SORT_BY_OPTIONS:
                     sorting_params.append({field: order})

         sorting_params.extend(
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -15,8 +15,8 @@
     MinimalOriginDict,
     OriginDict,
     PagedResult,
-    get_expansion,
 )
+from swh.search.utils import get_expansion

 _words_regexp = re.compile(r"\w+")

@@ -39,7 +39,7 @@
     return extract(d, values)


-def _nested_get(nested_dict, nested_keys):
+def _nested_get(nested_dict, nested_keys, default=""):
     """Extracts values from deeply nested dictionary nested_dict
     using the nested_keys and returns a list of all of the values
     discovered in the process.
@@ -75,7 +75,7 @@
                 ]
             # If value isn't a list or string or integer
             elif type_curr_obj != str and type_curr_obj != int:
-                return ""
+                return default

             # If only one element is present in the list, take it out
             # This ensures a flat array every time
@@ -84,7 +84,7 @@

             return curr_obj
         except Exception:
-            return []
+            return default

     res = _nested_get_recursive(nested_dict, nested_keys)
     if type(res) != list:
@@ -109,15 +109,28 @@
         field = field[1:]
         reversed = True

+    DATETIME_MAX = datetime.max.replace(tzinfo=timezone.utc)
+    DATETIME_MIN = "0001-01-01T00:00:00Z"
+
+    DATE_MAX = datetime.max
+
     if field == "score":
         if reversed:
             return -origin.get(field, 0)
         else:
             return origin.get(field, 0)

-    datetime_max = datetime.max.replace(tzinfo=timezone.utc)
+    if field in ["date_created", "date_modified", "date_published"]:
+        if reversed:
+            return DATE_MAX - datetime.strptime(
+                _nested_get(origin, get_expansion(field), "0001-01-01")[0], "%Y-%m-%d"
+            )
+        else:
+            return datetime.strptime(
+                _nested_get(origin, get_expansion(field), "0001-01-01")[0], "%Y-%m-%d"
+            )

-    if field in ["nb_visits"]:  # unlike other options, nb_visits is of type integer
+    elif field in ["nb_visits"]:  # unlike other options, nb_visits is of type integer
         if reversed:
             return -origin.get(field, 0)
         else:
@@ -125,12 +138,12 @@

    elif field in SORT_BY_OPTIONS:
        if reversed:
-            return datetime_max - datetime.fromisoformat(
-                origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00")
+            return DATETIME_MAX - datetime.fromisoformat(
+                origin.get(field, DATETIME_MIN).replace("Z", "+00:00")
             )
         else:
             return datetime.fromisoformat(
-                origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00")
+                origin.get(field, DATETIME_MIN).replace("Z", "+00:00")
             )


@@ -256,6 +269,9 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
+        min_date_created: str = "",
+        min_date_modified: str = "",
+        min_date_published: str = "",
         programming_languages: Optional[List[str]] = None,
         licenses: Optional[List[str]] = None,
         keywords: Optional[List[str]] = None,
@@ -357,6 +373,34 @@
                 hits,
             )

+        if min_date_created:
+            min_date_created_obj = datetime.strptime(min_date_created, "%Y-%m-%d")
+            hits = filter(
+                lambda o: datetime.strptime(
+                    _nested_get(o, get_expansion("date_created"))[0], "%Y-%m-%d"
+                )
+                >= min_date_created_obj,
+                hits,
+            )
+        if min_date_modified:
+            min_date_modified_obj = datetime.strptime(min_date_modified, "%Y-%m-%d")
+            hits = filter(
+                lambda o: datetime.strptime(
+                    _nested_get(o, get_expansion("date_modified"))[0], "%Y-%m-%d"
+                )
+                >= min_date_modified_obj,
+                hits,
+            )
+        if min_date_published:
+            min_date_published_obj = datetime.strptime(min_date_published, "%Y-%m-%d")
+            hits = filter(
+                lambda o: datetime.strptime(
+                    _nested_get(o, get_expansion("date_published"))[0], "%Y-%m-%d"
+                )
+                >= min_date_published_obj,
+                hits,
+            )
+
         if licenses:
             queried_licenses = [license_keyword.lower() for license_keyword in licenses]
             hits = filter(
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -19,31 +19,12 @@
     "last_eventful_visit_date",
     "last_revision_date",
     "last_release_date",
+    "date_created",
+    "date_modified",
+    "date_published",
 ]


-def get_expansion(field, sep=None):
-    METADATA_FIELDS = {
-        "licenses": ["intrinsic_metadata", "http://schema.org/license", "@id"],
-        "programming_languages": [
-            "intrinsic_metadata",
-            "http://schema.org/programmingLanguage",
-            "@value",
-        ],
-        "keywords": ["intrinsic_metadata", "http://schema.org/keywords", "@value",],
-        "descriptions": [
-            "intrinsic_metadata",
-            "http://schema.org/description",
-            "@value",
-        ],
-    }
-
-    if sep:
-        return sep.join(METADATA_FIELDS[field])
-
-    return METADATA_FIELDS[field]
-
-
 class MinimalOriginDict(TypedDict):
     """Mandatory keys of an :class:`OriginDict`"""

@@ -93,6 +74,9 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
+        min_date_created: str = "",
+        min_date_modified: str = "",
+        min_date_published: str = "",
         programming_languages: Optional[List[str]] = None,
         licenses: Optional[List[str]] = None,
         keywords: Optional[List[str]] = None,
@@ -121,6 +105,12 @@
                 last_revision_date on or after the provided date(ISO format)
             min_last_release_date: Filter origins that have
                 last_release_date on or after the provided date(ISO format)
+            min_date_created: Filter origins that have date_created
+                from intrinsic_metadata on or after the provided date
+            min_date_modified: Filter origins that have date_modified
+                from intrinsic_metadata on or after the provided date
+            min_date_published: Filter origins that have date_published
+                from intrinsic_metadata on or after the provided date
             programming_languages: Filter origins with programming languages
                 present in the given list (based on instrinsic_metadata)
             licenses: Filter origins with licenses present in the given list
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -409,6 +409,71 @@
             date_type="last_revision_date"
         )

+    def test_origin_instrinsic_metadata_dates_filter_sorting_search(self):
+
+        DATE_0 = "1999-06-28"
+        DATE_1 = "2001-02-13"
+        DATE_2 = "2005-10-02"
+
+        ORIGINS = [
+            {
+                "url": "http://foobar.0.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "dateCreated": DATE_0,
+                    "dateModified": DATE_1,
+                    "datePublished": DATE_2,
+                },
+            },
+            {
+                "url": "http://foobar.1.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "dateCreated": DATE_1,
+                    "dateModified": DATE_2,
+                    "datePublished": DATE_2,
+                },
+            },
+            {
+                "url": "http://foobar.2.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "dateCreated": DATE_2,
+                    "dateModified": DATE_2,
+                    "datePublished": DATE_2,
+                },
+            },
+        ]
+        self.search.origin_update(ORIGINS)
+        self.search.flush()
+
+        def _check_results(origin_indices, sort_results=True, **kwargs):
+            page = self.search.origin_search(url_pattern="foobar", **kwargs)
+            results = [r["url"] for r in page.results]
+            if sort_results:
+                assert sorted(results) == sorted(
+                    [ORIGINS[index]["url"] for index in origin_indices]
+                )
+            else:
+                assert results == [ORIGINS[index]["url"] for index in origin_indices]
+
+        _check_results(min_date_created=DATE_0, origin_indices=[0, 1, 2])
+        _check_results(min_date_created=DATE_1, origin_indices=[1, 2])
+        _check_results(min_date_created=DATE_2, origin_indices=[2])
+
+        _check_results(min_date_modified=DATE_0, origin_indices=[0, 1, 2])
+        _check_results(min_date_modified=DATE_1, origin_indices=[0, 1, 2])
+        _check_results(min_date_modified=DATE_2, origin_indices=[1, 2])
+
+        _check_results(min_date_published=DATE_0, origin_indices=[0, 1, 2])
+        _check_results(min_date_published=DATE_1, origin_indices=[0, 1, 2])
+        _check_results(min_date_published=DATE_2, origin_indices=[0, 1, 2])
+
+        # Sorting
+        _check_results(
+            sort_by=["-date_created"], origin_indices=[2, 1, 0], sort_results=False
+        )
+
     def test_origin_keywords_search(self):
         ORIGINS = [
             {
diff --git a/swh/search/utils.py b/swh/search/utils.py
new file
--- /dev/null
+++ b/swh/search/utils.py
@@ -0,0 +1,35 @@
+def get_expansion(field, sep=None):
+    METADATA_FIELDS = {
+        "licenses": ["intrinsic_metadata", "http://schema.org/license", "@id"],
+        "programming_languages": [
+            "intrinsic_metadata",
+            "http://schema.org/programmingLanguage",
+            "@value",
+        ],
+        "keywords": ["intrinsic_metadata", "http://schema.org/keywords", "@value",],
+        "descriptions": [
+            "intrinsic_metadata",
+            "http://schema.org/description",
+            "@value",
+        ],
+        "date_created": [
+            "intrinsic_metadata",
+            "http://schema.org/dateCreated",
+            "@value",
+        ],
+        "date_modified": [
+            "intrinsic_metadata",
+            "http://schema.org/dateModified",
+            "@value",
+        ],
+        "date_published": [
+            "intrinsic_metadata",
+            "http://schema.org/datePublished",
+            "@value",
+        ],
+    }
+
+    if sep:
+        return sep.join(METADATA_FIELDS[field])
+
+    return METADATA_FIELDS[field]
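Note (not part of the patch): the sketch below illustrates how the new origin_search() parameters could be exercised, modeled on the test added in swh/search/tests/test_search.py. It assumes the in-memory backend can be instantiated directly as swh.search.in_memory.InMemorySearch() with no arguments and set up via initialize(); the rest (origin_update(), flush(), the min_date_* / sort_by keyword arguments, and PagedResult.results) is what the diff itself adds or exercises. Date filters take "YYYY-MM-DD" strings, and a leading "-" on a sort field requests descending order.

    from swh.search.in_memory import InMemorySearch  # assumed entry point
    from swh.search.utils import get_expansion

    # get_expansion() resolves a logical field name to its expanded JSON-LD
    # path; with a "." separator this is the dotted field name used by the
    # Elasticsearch "range" filter and nested sort clause added in this diff.
    assert get_expansion("date_created", ".") == (
        "intrinsic_metadata.http://schema.org/dateCreated.@value"
    )

    search = InMemorySearch()
    search.initialize()
    search.origin_update(
        [
            {
                "url": "http://foobar.0.com",
                "intrinsic_metadata": {
                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                    "dateCreated": "1999-06-28",
                    "dateModified": "2001-02-13",
                    "datePublished": "2005-10-02",
                },
            },
        ]
    )
    search.flush()

    # Keep only origins created on or after 1999-01-01, newest first.
    page = search.origin_search(
        url_pattern="foobar",
        min_date_created="1999-01-01",
        sort_by=["-date_created"],
    )
    print([r["url"] for r in page.results])  # expected: ['http://foobar.0.com']

The same keyword arguments work against the Elasticsearch backend, where they translate into the nested "range" filters and the "nested_path"-based sort shown in swh/search/elasticsearch.py above.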