diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -333,6 +333,8 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + programming_languages: List[str] = [], + licenses: List[str] = [], page_token: Optional[str] = None, sort_by: List[str] = [], limit: int = 50, @@ -430,6 +432,57 @@ } ) + if licenses or programming_languages: + + license_filters = [] + for license in licenses: + license_filters.append( + { + "match": { + ( + "intrinsic_metadata" ".http://schema.org/license" ".@id" + ): license + } + } + ) + + language_filters = [] + for language in programming_languages: + language_filters.append( + { + "match": { + ( + "intrinsic_metadata" + ".http://schema.org/programmingLanguage" + ".@value" + ): language + } + } + ) + + intrinsic_metadata_filters = [ + {"bool": {"should": license_filters}}, + {"bool": {"should": language_filters}}, + ] + + query_clauses.append( + { + "nested": { + "path": "intrinsic_metadata", + "query": {"bool": {"must": intrinsic_metadata_filters,}}, + # "must" is equivalent to "AND" + # "should" is equivalent to "OR" + # Resulting origins must return true for the following: + # (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..) 
def _nested_get(nested_dict, nested_keys):
    """Collect the values found by following a key path through nested data.

    ``nested_keys`` is followed, outermost key first, through ``nested_dict``.
    Whenever a list is met on the way, the remaining keys are applied to each
    of its elements, so several values may be discovered; they are returned
    together as a list.

    Args:
        nested_dict: a dictionary or a list of dictionaries, arbitrarily nested
        nested_keys: list of (hashable) keys to follow

    Returns:
        A list of the discovered values. A branch in which a requested key is
        missing contributes ``""``. A list holding a single element is
        replaced by that element, which keeps the result flat.

    >>> nested_dict = [
    ...     {"name": [{"@value": {"first": "f1", "last": "l1"}}], "address": "XYZ"},
    ...     {"name": [{"@value": {"first": "f2", "last": "l2"}}], "address": "ABC"},
    ... ]
    >>> _nested_get(nested_dict, ["name", "@value", "last"])
    ['l1', 'l2']
    >>> _nested_get(nested_dict, ["address"])
    ['XYZ', 'ABC']

    It doesn't allow fetching intermediate values and returns "" for such cases

    >>> _nested_get(nested_dict, ["name", "@value"])
    ['', '']

    String and integer leaves are passed through untouched, even when a string
    happens to contain the name of one of the keys

    >>> _nested_get([{"n": {"v": 1}}, {"n": {"v": "a v b"}}], ["n", "v"])
    [1, 'a v b']
    """

    def _collect(node, keys):
        for depth, key in enumerate(keys):
            if isinstance(node, dict):
                # Only dictionaries are indexed by key. Running the membership
                # test on other types is wrong: on a string it is a substring
                # test, on an integer it raises TypeError.
                if key not in node:
                    return ""
                node = node[key]
            elif isinstance(node, list):
                # Fan out: the current key has not been consumed yet, so it is
                # applied again, with the following ones, to every element.
                node = [_collect(item, keys[depth:]) for item in node]
            elif not isinstance(node, (str, int)):
                # Neither a container nor a supported scalar
                return ""
            # str / int: already a leaf, the remaining keys leave it unchanged

        # If only one element is present in the list, take it out.
        # This ensures a flat list every time.
        if isinstance(node, list) and len(node) == 1:
            node = node[0]

        return node

    # Every node type is handled explicitly above, so no blanket exception
    # handler (which used to turn valid leaves into []) is needed.
    found = _collect(nested_dict, nested_keys)
    return found if isinstance(found, list) else [found]
@@ -153,6 +208,26 @@ .replace("Z", "+00:00") ), ).isoformat() + if "intrinsic_metadata" in document: + document["intrinsic_metadata"] = codemeta.expand( + document["intrinsic_metadata"] + ) + + if len(document["intrinsic_metadata"]) != 1: + continue + + metadata = document["intrinsic_metadata"][0] + if "http://schema.org/license" in metadata: + metadata["http://schema.org/license"] = [ + {"@id": license["@id"].lower()} + for license in metadata["http://schema.org/license"] + ] + if "http://schema.org/programmingLanguage" in metadata: + metadata["http://schema.org/programmingLanguage"] = [ + {"@value": license["@value"].lower()} + for license in metadata["http://schema.org/programmingLanguage"] + ] + self._origins[id_].update(document) if id_ not in self._origin_ids: @@ -171,6 +246,8 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + programming_languages: List[str] = [], + licenses: List[str] = [], sort_by: List[str] = [], limit: int = 50, ) -> PagedResult[MinimalOriginDict]: @@ -267,6 +344,48 @@ >= datetime.fromisoformat(min_last_release_date), hits, ) + if licenses: + METADATA_LICENSES = [ + "intrinsic_metadata", + "http://schema.org/license", + "@id", + ] + licenses = [license_keyword.lower() for license_keyword in licenses] + hits = filter( + lambda o: any( + # If any of the queried licenses are found, include the origin + any( + # returns True if queried_license_keyword is found + # in any of the licenses of the origin + queried_license_keyword in origin_license + for origin_license in _nested_get(o, METADATA_LICENSES) + ) + for queried_license_keyword in licenses + ), + hits, + ) + if programming_languages: + METADATA_PROGRAMMING_LANGS = [ + "intrinsic_metadata", + "http://schema.org/programmingLanguage", + "@value", + ] + programming_languages = [ + lang_keyword.lower() for lang_keyword in programming_languages + ] + hits = filter( + lambda o: any( + # If any of the queried languages are found, 
include the origin + any( + # returns True if queried_lang_keyword is found + # in any of the langs of the origin + queried_lang_keyword in origin_lang + for origin_lang in _nested_get(o, METADATA_PROGRAMMING_LANGS) + ) + for queried_lang_keyword in programming_languages + ), + hits, + ) if visit_types is not None: visit_types_set = set(visit_types) diff --git a/swh/search/interface.py b/swh/search/interface.py --- a/swh/search/interface.py +++ b/swh/search/interface.py @@ -72,6 +72,8 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + programming_languages: List[str] = [], + licenses: List[str] = [], sort_by: List[str] = [], limit: int = 50, ) -> PagedResult[MinimalOriginDict]: @@ -95,6 +97,10 @@ last_revision_date on or after the provided date(ISO format) min_last_release_date: Filter origins that have last_release_date on or after the provided date(ISO format) + licenses: Filter origins with licenses present in the given list + (based on instrinsic_metadata) + programming_languages: Filter origins with programming languages + present in the given list (based on instrinsic_metadata) sort_by: Sort results based on a list of fields mentioned in SORT_BY_OPTIONS (nb_visits,last_visit_date, last_eventful_visit_date, last_revision_date, last_release_date). 
def test_nested_get_helper_function():
    """Check ``_nested_get`` on expanded codemeta data and on a plain structure."""
    expanded_metadata = codemeta.expand(
        {
            "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
            "keywords": ["bar", "baz"],
            "description": "foo bar 3",
            "programmingLanguage": "cpp",
            "license": "https://spdx.org/licenses/LGPL-2.0-only",
        }
    )
    # Pin the expanded form first: the lookups below rely on this exact layout
    assert expanded_metadata == [
        {
            "http://schema.org/description": [{"@value": "foo bar 3"}],
            "http://schema.org/license": [
                {"@id": "https://spdx.org/licenses/LGPL-2.0-only"}
            ],
            "http://schema.org/keywords": [{"@value": "bar"}, {"@value": "baz"}],
            "http://schema.org/programmingLanguage": [{"@value": "cpp"}],
        }
    ]

    license_path = ["http://schema.org/license", "@id"]
    assert _nested_get(expanded_metadata, license_path) == [
        "https://spdx.org/licenses/LGPL-2.0-only"
    ]

    records = [
        {"name": [{"@value": {"first": "f1", "last": "l1"}}], "address": "XYZ"},
        {"name": [{"@value": {"first": "f2", "last": "l2"}}], "address": "ABC"},
        {"name": [{"@value": {"first": "f3"}}], "address": {}},
        {"name": [{"@value": {"first": "f4"}}], "address": []},
    ]
    # (key path, expected values), checked in order
    lookups = [
        (["name", "@value", "last"], ["l1", "l2", "", ""]),
        (["name", "@value", "first"], ["f1", "f2", "f3", "f4"]),
        (["address"], ["XYZ", "ABC", {}, []]),
        # shouldn't allow fetching intermediate values
        (["name", "@value"], ["", "", "", ""]),
    ]
    for key_path, expected_values in lookups:
        assert _nested_get(records, key_path) == expected_values
b/swh/search/tests/test_journal_client.py --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -254,6 +254,8 @@ "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", + "programmingLanguage": "python", + "license": "MIT", }, }, ] @@ -266,6 +268,8 @@ "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", + "programmingLanguage": "python", + "license": "MIT", }, }, ] diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -449,6 +449,129 @@ _check_results(["nb_visits", "-last_visit_date"], ORIGINS) _check_results(["-last_visit_date", "nb_visits"], ORIGINS[::-1]) + def test_origin_instrinsic_metadata_license_search(self): + ORIGINS = [ + { + "url": "http://foobar.1.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + "license": "https://spdx.org/licenses/MIT", + }, + }, + { + "url": "http://foobar.2.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + "license": "BSD-3-Clause", + }, + }, + ] + self.search.origin_update(ORIGINS) + self.search.flush() + + def _check_results(licenses, origin_indices): + page = self.search.origin_search(url_pattern="foobar", licenses=licenses) + results = [r["url"] for r in page.results] + assert sorted(results) == sorted( + [ORIGINS[i]["url"] for i in origin_indices] + ) + + _check_results(["MIT"], [0]) + _check_results(["bsd"], [1]) + _check_results(["mit", "3-Clause"], [0, 1]) + + def test_origin_instrinsic_metadata_programming_language_search(self): + ORIGINS = [ + { + "url": "http://foobar.1.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + "programmingLanguage": "python", + }, + }, + { 
+ "url": "http://foobar.2.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + "programmingLanguage": "javascript", + }, + }, + ] + self.search.origin_update(ORIGINS) + self.search.flush() + + def _check_results(programming_languages, origin_indices): + page = self.search.origin_search( + url_pattern="foobar", programming_languages=programming_languages + ) + results = [r["url"] for r in page.results] + assert sorted(results) == sorted( + [ORIGINS[i]["url"] for i in origin_indices] + ) + + _check_results(["python"], [0]) + _check_results(["javascript"], [1]) + _check_results(["python", "javascript"], [0, 1]) + + def test_origin_instrinsic_metadata_multiple_field_search(self): + ORIGINS = [ + { + "url": "http://foobar.1.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar 1", + "programmingLanguage": "python", + "license": "https://spdx.org/licenses/MIT", + }, + }, + { + "url": "http://foobar.2.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar 2", + "programmingLanguage": ["javascript", "html", "css"], + "license": [ + "https://spdx.org/licenses/CC-BY-1.0", + "https://spdx.org/licenses/Apache-1.0", + ], + }, + }, + { + "url": "http://foobar.3.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar 3", + "programmingLanguage": ["Cpp", "c"], + "license": "https://spdx.org/licenses/LGPL-2.0-only", + }, + }, + ] + self.search.origin_update(ORIGINS) + self.search.flush() + + def _check_result(programming_languages, licenses, origin_indices): + page = self.search.origin_search( + url_pattern="foobar", + programming_languages=programming_languages, + licenses=licenses, + ) + results = [r["url"] for r in page.results] + assert sorted(results) == sorted( + [ORIGINS[i]["url"] for i in origin_indices] + ) + + 
_check_result(["javascript"], ["CC"], [1]) + _check_result(["css"], ["CC"], [1]) + _check_result(["css"], ["CC", "apache"], [1]) + + _check_result(["python", "javascript"], ["MIT"], [0]) + + _check_result(["c", "python"], ["LGPL", "mit"], [2, 0]) + def test_origin_update_with_no_visit_types(self): """ Update an origin with visit types first then with no visit types, diff --git a/tox.ini b/tox.ini --- a/tox.ini +++ b/tox.ini @@ -7,8 +7,9 @@ deps = pytest-cov commands = - pytest --cov={envsitepackagesdir}/swh/search \ + pytest --doctest-modules \ {envsitepackagesdir}/swh/search \ + --cov={envsitepackagesdir}/swh/search \ --cov-branch {posargs} [testenv:black]