diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -48,6 +48,8 @@ "last_eventful_visit_date", "last_revision_date", "last_release_date", + "programming_language", + "license", ): if field_name in origin: res[field_name] = origin.pop(field_name) @@ -169,6 +171,8 @@ "last_eventful_visit_date": {"type": "date"}, "last_release_date": {"type": "date"}, "last_revision_date": {"type": "date"}, + "programming_language": {"type": "keyword"}, + "license": {"type": "keyword"}, "intrinsic_metadata": { "type": "nested", "properties": { @@ -333,6 +337,8 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + programming_language: str = "", + license: str = "", page_token: Optional[str] = None, sort_by: List[str] = [], limit: int = 50, @@ -429,6 +435,12 @@ } } ) + if license: + query_clauses.append({"match": {"license": license,}}) + if programming_language: + query_clauses.append( + {"match": {"programming_language": programming_language,}} + ) if visit_types is not None: query_clauses.append({"terms": {"visit_types": visit_types}}) diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -171,6 +171,8 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + programming_language: str = "", + license: str = "", sort_by: List[str] = [], limit: int = 50, ) -> PagedResult[MinimalOriginDict]: @@ -267,6 +269,13 @@ >= datetime.fromisoformat(min_last_release_date), hits, ) + if license: + hits = filter(lambda o: o.get("license", "") == license, hits,) + if programming_language: + hits = filter( + lambda o: o.get("programming_language", "") == programming_language, + hits, + ) if visit_types is not None: visit_types_set = set(visit_types) diff --git a/swh/search/interface.py b/swh/search/interface.py --- 
a/swh/search/interface.py +++ b/swh/search/interface.py @@ -72,6 +72,8 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + programming_language: str = "", + license: str = "", sort_by: List[str] = [], limit: int = 50, ) -> PagedResult[MinimalOriginDict]: @@ -95,6 +97,10 @@ last_revision_date on or after the provided date(ISO format) min_last_release_date: Filter origins that have last_release_date on or after the provided date(ISO format) + license: Filter origins with given license + (extracted from intrinsic_metadata) + programming_language: Filter origins with given programming_language + (extracted from intrinsic_metadata) sort_by: Sort results based on a list of fields mentioned in SORT_BY_OPTIONS (nb_visits,last_visit_date, last_eventful_visit_date, last_revision_date, last_release_date). diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py --- a/swh/search/journal_client.py +++ b/swh/search/journal_client.py @@ -16,6 +16,18 @@ } +def _nested_pop(nested_dict, nested_key, default=""): + try: + keys = nested_key.split(".") + dict_ref = nested_dict + for k in keys[:-1]: + dict_ref = dict_ref[k] + + return dict_ref.pop(keys[-1]) + except Exception: + return default + + def fetch_last_revision_release_date(snapshot_id, storage): if not snapshot_id: return {} @@ -113,7 +125,12 @@ logging.debug("processing origin intrinsic_metadata %r", origin_metadata) origin_metadata = [ - {"url": item["id"], "intrinsic_metadata": item["metadata"],} + { + "url": item["id"], + "license": _nested_pop(item, "metadata.license"), + "programming_language": _nested_pop(item, "metadata.programmingLanguage"), + "intrinsic_metadata": item["metadata"], + } for item in origin_metadata ] diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -254,6 +254,8 @@ "metadata": { 
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", + "programmingLanguage": "python", + "license": "MIT", }, }, ] @@ -267,6 +269,8 @@ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", }, + "programming_language": "python", + "license": "MIT", }, ] )