diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -18,9 +18,9 @@
     MinimalOriginDict,
     OriginDict,
     PagedResult,
-    get_expansion,
 )
 from swh.search.metrics import send_metric, timed
+from swh.search.utils import get_expansion, is_date_parsable
 
 INDEX_NAME_PARAM = "index"
 READ_ALIAS_PARAM = "read_alias"
@@ -66,23 +66,29 @@
     # * {"author": [{"@value": "Jane Doe"}]}
     # and JSON-LD expansion will convert them all to the last one.
     if "intrinsic_metadata" in res:
-        res["intrinsic_metadata"] = codemeta.expand(res["intrinsic_metadata"])
+        intrinsic_metadata = res["intrinsic_metadata"]
+        for date_field in ["dateCreated", "dateModified", "datePublished"]:
+            if date_field in intrinsic_metadata:
+                date = intrinsic_metadata[date_field]
+
+                # If the date{Created,Modified,Published} value isn't parsable,
+                # it gets rejected and isn't stored (unlike other fields)
+                if not is_date_parsable(date):
+                    intrinsic_metadata.pop(date_field)
+
+        res["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata)
 
     return res
 
 
 def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str:
-    """Tokenize as string an index page result from a search
-
-    """
+    """Tokenize as string an index page result from a search"""
     page_token = base64.b64encode(msgpack.dumps(index_to_tokenize))
     return page_token.decode()
 
 
 def token_decode(page_token: str) -> Dict[bytes, Any]:
-    """Read the page_token
-
-    """
+    """Read the page_token"""
     return msgpack.loads(base64.b64decode(page_token.encode()), raw=True)
 
 
@@ -177,7 +183,20 @@
                             # don't bother indexing tokens in these URIs, as they
                             # are used as namespaces
                             "type": "keyword",
-                        }
+                        },
+                        "http://schema": {
+                            "properties": {
+                                "org/dateCreated": {
+                                    "properties": {"@value": {"type": "date",}}
+                                },
+                                "org/dateModified": {
+                                    "properties": {"@value": {"type": "date",}}
+                                },
+                                "org/datePublished": {
+                                    "properties": {"@value": {"type": "date",}}
+                                },
+                            }
+                        },
                     },
                 },
                 # Has this origin been taken down?
@@ -334,6 +353,9 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
+        min_date_created: str = "",
+        min_date_modified: str = "",
+        min_date_published: str = "",
         programming_languages: Optional[List[str]] = None,
         licenses: Optional[List[str]] = None,
         keywords: Optional[List[str]] = None,
@@ -378,6 +400,8 @@
                         # Searches on all fields of the intrinsic_metadata dict,
                         # recursively.
"fields": ["intrinsic_metadata.*"], + # date{Created,Modified,Published} are of type date + "lenient": True, } }, } @@ -472,6 +496,33 @@ ) intrinsic_metadata_filters.append({"bool": {"should": language_filters}}) + if min_date_created: + intrinsic_metadata_filters.append( + { + "range": { + get_expansion("date_created", "."): {"gte": min_date_created,} + } + } + ) + if min_date_modified: + intrinsic_metadata_filters.append( + { + "range": { + get_expansion("date_modified", "."): {"gte": min_date_modified,} + } + } + ) + if min_date_published: + intrinsic_metadata_filters.append( + { + "range": { + get_expansion("date_published", "."): { + "gte": min_date_published, + } + } + } + ) + if intrinsic_metadata_filters: query_clauses.append( { @@ -494,7 +545,7 @@ if visit_types is not None: query_clauses.append({"terms": {"visit_types": visit_types}}) - sorting_params = [] + sorting_params: List[Dict[str, Any]] = [] if sort_by: for field in sort_by: @@ -503,7 +554,16 @@ field = field[1:] order = "desc" - if field in SORT_BY_OPTIONS: + if field in ["date_created", "date_modified", "date_published"]: + sorting_params.append( + { + get_expansion(field, "."): { + "nested_path": "intrinsic_metadata", + "order": order, + } + } + ) + elif field in SORT_BY_OPTIONS: sorting_params.append({field: order}) sorting_params.extend( diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -15,8 +15,8 @@ MinimalOriginDict, OriginDict, PagedResult, - get_expansion, ) +from swh.search.utils import get_expansion, is_date_parsable _words_regexp = re.compile(r"\w+") @@ -39,7 +39,7 @@ return extract(d, values) -def _nested_get(nested_dict, nested_keys): +def _nested_get(nested_dict, nested_keys, default=""): """Extracts values from deeply nested dictionary nested_dict using the nested_keys and returns a list of all of the values discovered in the process. 
@@ -75,7 +75,7 @@
                 ]
             # If value isn't a list or string or integer
             elif type_curr_obj != str and type_curr_obj != int:
-                return ""
+                return default
 
             # If only one element is present in the list, take it out
             # This ensures a flat array every time
@@ -84,7 +84,7 @@
             return curr_obj
 
         except Exception:
-            return []
+            return default
 
     res = _nested_get_recursive(nested_dict, nested_keys)
     if type(res) != list:
@@ -109,29 +109,41 @@
                 field = field[1:]
                 reversed = True
 
+            DATETIME_OBJ_MAX = datetime.max.replace(tzinfo=timezone.utc)
+            DATETIME_MIN = "0001-01-01T00:00:00Z"
+
+            DATE_OBJ_MAX = datetime.max
+            DATE_MIN = "0001-01-01"
+
             if field == "score":
                 if reversed:
                     return -origin.get(field, 0)
                 else:
                     return origin.get(field, 0)
 
-            datetime_max = datetime.max.replace(tzinfo=timezone.utc)
+            if field in ["date_created", "date_modified", "date_published"]:
+                date = datetime.strptime(
+                    _nested_get(origin, get_expansion(field), DATE_MIN)[0], "%Y-%m-%d"
+                )
+                if reversed:
+                    return DATE_OBJ_MAX - date
+                else:
+                    return date
 
-            if field in ["nb_visits"]:  # unlike other options, nb_visits is of type integer
+            elif field in ["nb_visits"]:  # unlike other options, nb_visits is of type integer
                 if reversed:
                     return -origin.get(field, 0)
                 else:
                     return origin.get(field, 0)
 
             elif field in SORT_BY_OPTIONS:
+                date = datetime.fromisoformat(
+                    origin.get(field, DATETIME_MIN).replace("Z", "+00:00")
+                )
                 if reversed:
-                    return datetime_max - datetime.fromisoformat(
-                        origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00")
-                    )
+                    return DATETIME_OBJ_MAX - date
                 else:
-                    return datetime.fromisoformat(
-                        origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00")
-                    )
+                    return date
 
 
 class InMemorySearch:
@@ -220,9 +232,18 @@
                 ),
             ).isoformat()
             if "intrinsic_metadata" in document:
-                document["intrinsic_metadata"] = codemeta.expand(
-                    document["intrinsic_metadata"]
-                )
+                intrinsic_metadata = document["intrinsic_metadata"]
+
+                for date_field in ["dateCreated", "dateModified", "datePublished"]:
+                    if date_field in intrinsic_metadata:
+                        date = intrinsic_metadata[date_field]
+
+                        # If the date{Created,Modified,Published} value isn't parsable,
+                        # it gets rejected and isn't stored (unlike other fields)
+                        if not is_date_parsable(date):
+                            intrinsic_metadata.pop(date_field)
+
+                document["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata)
 
                 if len(document["intrinsic_metadata"]) != 1:
                     continue
@@ -256,6 +277,9 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
+        min_date_created: str = "",
+        min_date_modified: str = "",
+        min_date_published: str = "",
         programming_languages: Optional[List[str]] = None,
         licenses: Optional[List[str]] = None,
         keywords: Optional[List[str]] = None,
@@ -357,6 +381,34 @@
                 hits,
             )
 
+        if min_date_created:
+            min_date_created_obj = datetime.strptime(min_date_created, "%Y-%m-%d")
+            hits = filter(
+                lambda o: datetime.strptime(
+                    _nested_get(o, get_expansion("date_created"))[0], "%Y-%m-%d"
+                )
+                >= min_date_created_obj,
+                hits,
+            )
+        if min_date_modified:
+            min_date_modified_obj = datetime.strptime(min_date_modified, "%Y-%m-%d")
+            hits = filter(
+                lambda o: datetime.strptime(
+                    _nested_get(o, get_expansion("date_modified"))[0], "%Y-%m-%d"
+                )
+                >= min_date_modified_obj,
+                hits,
+            )
+        if min_date_published:
+            min_date_published_obj = datetime.strptime(min_date_published, "%Y-%m-%d")
+            hits = filter(
+                lambda o: datetime.strptime(
+                    _nested_get(o, get_expansion("date_published"))[0], "%Y-%m-%d"
+                )
+                >= min_date_published_obj,
+                hits,
+            )
+
         if licenses:
             queried_licenses = [license_keyword.lower() for license_keyword in licenses]
             hits = filter(
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -19,31 +19,12 @@
     "last_eventful_visit_date",
     "last_revision_date",
     "last_release_date",
+    "date_created",
+    "date_modified",
+    "date_published",
 ]
 
 
-def get_expansion(field, sep=None):
-    METADATA_FIELDS = {
-        "licenses": ["intrinsic_metadata", "http://schema.org/license", "@id"],
-        "programming_languages": [
-            "intrinsic_metadata",
-            "http://schema.org/programmingLanguage",
-            "@value",
-        ],
-        "keywords": ["intrinsic_metadata", "http://schema.org/keywords", "@value",],
-        "descriptions": [
-            "intrinsic_metadata",
-            "http://schema.org/description",
-            "@value",
-        ],
-    }
-
-    if sep:
-        return sep.join(METADATA_FIELDS[field])
-
-    return METADATA_FIELDS[field]
-
-
 class MinimalOriginDict(TypedDict):
     """Mandatory keys of an :class:`OriginDict`"""
 
@@ -93,6 +74,9 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
+        min_date_created: str = "",
+        min_date_modified: str = "",
+        min_date_published: str = "",
         programming_languages: Optional[List[str]] = None,
         licenses: Optional[List[str]] = None,
         keywords: Optional[List[str]] = None,
@@ -121,6 +105,12 @@
                 last_revision_date on or after the provided date (ISO format)
             min_last_release_date: Filter origins that have last_release_date
                 on or after the provided date (ISO format)
+            min_date_created: Filter origins that have date_created
+                from intrinsic_metadata on or after the provided date
+            min_date_modified: Filter origins that have date_modified
+                from intrinsic_metadata on or after the provided date
+            min_date_published: Filter origins that have date_published
+                from intrinsic_metadata on or after the provided date
             programming_languages: Filter origins with programming languages
                 present in the given list (based on intrinsic_metadata)
             licenses: Filter origins with licenses present in the given list
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -409,6 +409,74 @@
             date_type="last_revision_date"
         )
 
+    def test_origin_instrinsic_metadata_dates_filter_sorting_search(self):
+
+        DATE_0 = "1999-06-28"
+        DATE_1 = "2001-02-13"
+        DATE_2 = "2005-10-02"
+
+        ORIGINS = [
+            {
+                "url": "http://foobar.0.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "dateCreated": DATE_0,
+                    "dateModified": DATE_1,
+                    "datePublished": DATE_2,
+                },
+            },
+            {
+                "url": "http://foobar.1.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "dateCreated": DATE_1,
+                    "dateModified": DATE_2,
+                    "datePublished": DATE_2,
+                },
+            },
+            {
+                "url": "http://foobar.2.com",
+                "intrinsic_metadata": {
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "dateCreated": DATE_2,
+                    "dateModified": DATE_2,
+                    "datePublished": DATE_2,
+                },
+            },
+        ]
+        self.search.origin_update(ORIGINS)
+        self.search.flush()
+
+        def _check_results(origin_indices, sort_results=True, **kwargs):
+            page = self.search.origin_search(url_pattern="foobar", **kwargs)
+            results = [r["url"] for r in page.results]
+            if sort_results:
+                assert sorted(results) == sorted(
+                    [ORIGINS[index]["url"] for index in origin_indices]
+                )
+            else:
+                assert results == [ORIGINS[index]["url"] for index in origin_indices]
+
+        _check_results(min_date_created=DATE_0, origin_indices=[0, 1, 2])
+        _check_results(min_date_created=DATE_1, origin_indices=[1, 2])
+        _check_results(min_date_created=DATE_2, origin_indices=[2])
+
+        _check_results(min_date_modified=DATE_0, origin_indices=[0, 1, 2])
+        _check_results(min_date_modified=DATE_1, origin_indices=[0, 1, 2])
+        _check_results(min_date_modified=DATE_2, origin_indices=[1, 2])
+
+        _check_results(min_date_published=DATE_0, origin_indices=[0, 1, 2])
+        _check_results(min_date_published=DATE_1, origin_indices=[0, 1, 2])
+        _check_results(min_date_published=DATE_2, origin_indices=[0, 1, 2])
+
+        # Sorting
+        _check_results(
+            sort_by=["-date_created"], origin_indices=[2, 1, 0], sort_results=False
+        )
+        _check_results(
+            sort_by=["date_created"], origin_indices=[0, 1, 2], sort_results=False
+        )
+
     def test_origin_keywords_search(self):
         ORIGINS = [
             {
@@ -914,13 +982,15 @@
         )
         self.search.flush()
 
-        actual_page = self.search.origin_search(metadata_pattern="2021")
+        actual_page = self.search.origin_search(metadata_pattern="1.0")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin1]
 
-        actual_page = self.search.origin_search(metadata_pattern="long time ago")
+        actual_page = self.search.origin_search(metadata_pattern="long")
         assert actual_page.next_page_token is None
-        assert actual_page.results == [origin2]
+        assert (
+            actual_page.results == []
+        )  # "long time ago" isn't a parsable date, so the value was rejected
 
         actual_page = self.search.origin_search(metadata_pattern="true")
         assert actual_page.next_page_token is None
diff --git a/swh/search/utils.py b/swh/search/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/search/utils.py
@@ -0,0 +1,57 @@
+from datetime import datetime
+
+import iso8601  # type: ignore
+
+
+def get_expansion(field, sep=None):
+    METADATA_FIELDS = {
+        "licenses": ["intrinsic_metadata", "http://schema.org/license", "@id"],
+        "programming_languages": [
+            "intrinsic_metadata",
+            "http://schema.org/programmingLanguage",
+            "@value",
+        ],
+        "keywords": ["intrinsic_metadata", "http://schema.org/keywords", "@value",],
+        "descriptions": [
+            "intrinsic_metadata",
+            "http://schema.org/description",
+            "@value",
+        ],
+        "date_created": [
+            "intrinsic_metadata",
+            "http://schema.org/dateCreated",
+            "@value",
+        ],
+        "date_modified": [
+            "intrinsic_metadata",
+            "http://schema.org/dateModified",
+            "@value",
+        ],
+        "date_published": [
+            "intrinsic_metadata",
+            "http://schema.org/datePublished",
+            "@value",
+        ],
+    }
+
+    if sep:
+        return sep.join(METADATA_FIELDS[field])
+
+    return METADATA_FIELDS[field]
+
+
+def is_date_parsable(date_str):
+    """
+    Return True if date_str is in %Y-%m-%d format
+    or standard ISO 8601 format.
+    Otherwise return False.
+    """
+    try:
+        datetime.strptime(date_str, "%Y-%m-%d")
+        return True
+    except Exception:
+        try:
+            iso8601.parse_date(date_str)
+            return True
+        except Exception:
+            return False
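
Reviewer note, not part of the patch: a minimal usage sketch of the new
min_date_{created,modified,published} filters and date_created sort key,
mirroring test_origin_instrinsic_metadata_dates_filter_sorting_search above
against the in-memory backend. The URL and dates are illustrative, and the
no-argument InMemorySearch() construction is assumed from the test setup.

    from swh.search.in_memory import InMemorySearch

    search = InMemorySearch()
    search.initialize()
    search.origin_update(
        [
            {
                "url": "http://example.com/project",
                "intrinsic_metadata": {
                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                    # stored only because it is parsable ("%Y-%m-%d" or ISO 8601);
                    # an unparsable value would be dropped at indexing time
                    "dateCreated": "2001-02-13",
                },
            },
        ]
    )
    search.flush()

    # Filter: keep origins whose dateCreated is on or after 2000-01-01
    page = search.origin_search(url_pattern="example", min_date_created="2000-01-01")
    assert [r["url"] for r in page.results] == ["http://example.com/project"]

    # Sort: newest dateCreated first (the "-" prefix reverses the order)
    page = search.origin_search(url_pattern="example", sort_by=["-date_created"])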