diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -18,6 +18,7 @@
     MinimalOriginDict,
     OriginDict,
     PagedResult,
+    get_expansion,
 )
 from swh.search.metrics import send_metric, timed
 
@@ -333,6 +334,9 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
+        min_date_created: str = "",
+        min_date_modified: str = "",
+        min_date_published: str = "",
         programming_languages: List[str] = [],
         licenses: List[str] = [],
         page_token: Optional[str] = None,
@@ -431,40 +435,57 @@
                     }
                 }
             )
 
-        if licenses or programming_languages:
+        intrinsic_metadata_filters: List[Dict[str, Dict]] = []
 
-            license_filters = []
-            for license in licenses:
-                license_filters.append(
-                    {
-                        "match": {
-                            (
-                                "intrinsic_metadata" ".http://schema.org/license" ".@id"
-                            ): license
+        if min_date_created:
+            intrinsic_metadata_filters.append(
+                {
+                    "range": {
+                        get_expansion("date_created", "."): {
+                            "gte": min_date_created.replace("Z", "+00:00")
+                        }
+                    }
+                }
+            )
+        if min_date_modified:
+            intrinsic_metadata_filters.append(
+                {
+                    "range": {
+                        get_expansion("date_modified", "."): {
+                            "gte": min_date_modified.replace("Z", "+00:00")
                         }
                     }
+                }
+            )
+        if min_date_published:
+            intrinsic_metadata_filters.append(
+                {
+                    "range": {
+                        get_expansion("date_published", "."): {
+                            "gte": min_date_published.replace("Z", "+00:00")
+                        }
+                    }
+                }
+            )
+
+        if licenses:
+            license_filters: List[Dict[str, Any]] = []
+            for license in licenses:
+                license_filters.append(
+                    {"match": {get_expansion("licenses", "."): license}}
                 )
+            intrinsic_metadata_filters.append({"bool": {"should": license_filters}})
 
+        if programming_languages:
             language_filters = []
             for language in programming_languages:
                 language_filters.append(
-                    {
-                        "match": {
-                            (
-                                "intrinsic_metadata"
-                                ".http://schema.org/programmingLanguage"
-                                ".@value"
-                            ): language
-                        }
-                    }
+                    {"match": {get_expansion("programming_languages", "."): language}}
                 )
+            intrinsic_metadata_filters.append({"bool": {"should": language_filters}})
 
-            intrinsic_metadata_filters = [
-                {"bool": {"should": license_filters}},
-                {"bool": {"should": language_filters}},
-            ]
-
+        if intrinsic_metadata_filters:
             query_clauses.append(
                 {
                     "nested": {
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -15,6 +15,7 @@
     MinimalOriginDict,
     OriginDict,
     PagedResult,
+    get_expansion,
 )
 
 _words_regexp = re.compile(r"\w+")
@@ -38,7 +39,7 @@
     return extract(d, values)
 
 
-def _nested_get(nested_dict, nested_keys):
+def _nested_get(nested_dict, nested_keys, default=""):
     """Extracts values from deeply nested dictionary nested_dict
     using the nested_keys and returns a list of all of the values
     discovered in the process.
@@ -74,7 +75,7 @@
                         ]
                     # If value isn't a list or string or integer
                     elif type_curr_obj != str and type_curr_obj != int:
-                        return ""
+                        return default
 
             # If only one element is present in the list, take it out
             # This ensures a flat array every time
@@ -112,6 +113,21 @@
         else:
             return origin.get(field, 0)
 
+    elif field in ["date_created", "date_modified", "date_published"]:
+
+        if reversed:
+            return datetime_max - datetime.fromisoformat(
+                _nested_get(origin, get_expansion(field), "0001-01-01T00:00:00Z")[
+                    0
+                ].replace("Z", "+00:00")
+            )
+        else:
+            return datetime.fromisoformat(
+                _nested_get(origin, get_expansion(field), "0001-01-01T00:00:00Z")[
+                    0
+                ].replace("Z", "+00:00")
+            )
+
     elif field in SORT_BY_OPTIONS:
         if reversed:
             return datetime_max - datetime.fromisoformat(
@@ -246,6 +262,9 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
+        min_date_created: str = "",
+        min_date_modified: str = "",
+        min_date_published: str = "",
         programming_languages: List[str] = [],
         licenses: List[str] = [],
         sort_by: List[str] = [],
@@ -344,6 +363,37 @@
                 >= datetime.fromisoformat(min_last_release_date),
                 hits,
             )
+        if min_date_created:
+            hits = filter(
+                lambda o: datetime.fromisoformat(
+                    _nested_get(
+                        o, get_expansion("date_created"), "0001-01-01T00:00:00Z"
+                    )[0].replace("Z", "+00:00")
+                )
+                >= datetime.fromisoformat(min_date_created),
+                hits,
+            )
+        if min_date_modified:
+            hits = filter(
+                lambda o: datetime.fromisoformat(
+                    _nested_get(
+                        o, get_expansion("date_modified"), "0001-01-01T00:00:00Z"
+                    )[0].replace("Z", "+00:00")
+                )
+                >= datetime.fromisoformat(min_date_modified),
+                hits,
+            )
+        if min_date_published:
+            hits = filter(
+                lambda o: datetime.fromisoformat(
+                    _nested_get(
+                        o, get_expansion("date_published"), "0001-01-01T00:00:00Z"
+                    )[0].replace("Z", "+00:00")
+                )
+                >= datetime.fromisoformat(min_date_published),
+                hits,
+            )
+
         if licenses:
             METADATA_LICENSES = [
                 "intrinsic_metadata",
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -19,9 +19,51 @@
     "last_eventful_visit_date",
     "last_revision_date",
     "last_release_date",
+    "date_created",
+    "date_modified",
+    "date_published",
 ]
 
 
+def get_expansion(field, sep=None):
+    """Return the intrinsic_metadata key path of ``field``.
+
+    The path is returned as a list of keys, or as a single string joined
+    with ``sep`` when a non-empty ``sep`` is given (e.g. ``"."``).
+
+    Raises:
+        KeyError: if ``field`` has no known expansion.
+    """
+    METADATA_FIELDS = {
+        "licenses": ["intrinsic_metadata", "http://schema.org/license", "@id"],
+        "programming_languages": [
+            "intrinsic_metadata",
+            "http://schema.org/programmingLanguage",
+            "@value",
+        ],
+        "date_created": [
+            "intrinsic_metadata",
+            "http://schema.org/dateCreated",
+            "@value",
+        ],
+        "date_modified": [
+            "intrinsic_metadata",
+            "http://schema.org/dateModified",
+            "@value",
+        ],
+        "date_published": [
+            "intrinsic_metadata",
+            "http://schema.org/datePublished",
+            "@value",
+        ],
+    }
+
+    if sep:
+        return sep.join(METADATA_FIELDS[field])
+
+    return METADATA_FIELDS[field]
+
+
 class MinimalOriginDict(TypedDict):
     """Mandatory keys of an :class:`OriginDict`"""
 
@@ -72,6 +114,9 @@
         min_last_eventful_visit_date: str = "",
         min_last_revision_date: str = "",
         min_last_release_date: str = "",
+        min_date_created: str = "",
+        min_date_modified: str = "",
+        min_date_published: str = "",
         programming_languages: List[str] = [],
         licenses: List[str] = [],
         sort_by: List[str] = [],
@@ -97,6 +142,12 @@
                 last_revision_date on or after the provided date(ISO format)
             min_last_release_date: Filter origins that have last_release_date
                 on or after the provided date(ISO format)
+            min_date_created: Filter origins that have date_created
+                from intrinsic_metadata on or after the provided date (ISO format)
+            min_date_modified: Filter origins that have date_modified
+                from intrinsic_metadata on or after the provided date (ISO format)
+            min_date_published: Filter origins that have date_published
+                from intrinsic_metadata on or after the provided date (ISO format)
             licenses: Filter origins with licenses present in the given list (based on
                 instrinsic_metadata)
             programming_languages: Filter origins with programming languages