diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -333,6 +333,8 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + programming_languages: List[str] = [], + licenses: List[str] = [], page_token: Optional[str] = None, sort_by: List[str] = [], limit: int = 50, @@ -430,6 +432,45 @@ } ) + if licenses or programming_languages: + intrinsic_metadata_fields = [] + + if licenses: + for license in licenses: + intrinsic_metadata_fields.append( + { + "match": { + ( + "intrinsic_metadata" + ".http://schema.org/license" + ".@id" + ): license + } + } + ) + if programming_languages: + for language in programming_languages: + intrinsic_metadata_fields.append( + { + "match": { + ( + "intrinsic_metadata" + ".http://schema.org/programmingLanguage" + ".@value" + ): language + } + } + ) + + query_clauses.append( + { + "nested": { + "path": "intrinsic_metadata", + "query": {"bool": {"should": intrinsic_metadata_fields,}}, + } + } + ) + if visit_types is not None: query_clauses.append({"terms": {"visit_types": visit_types}}) diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -171,6 +171,8 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + programming_languages: List[str] = [], + licenses: List[str] = [], sort_by: List[str] = [], limit: int = 50, ) -> PagedResult[MinimalOriginDict]: @@ -267,6 +269,30 @@ >= datetime.fromisoformat(min_last_release_date), hits, ) + if licenses: + licenses = [license.lower() for license in licenses] + hits = filter( + lambda o: any( + license + in o.get("intrinsic_metadata", {}).get("license", "").lower() + for license in licenses + ), + hits, + ) + if programming_languages: + programming_languages = [ + language.lower() for language in programming_languages + ] + hits = 
filter( + lambda o: any( + language + in o.get("intrinsic_metadata", {}) + .get("programmingLanguage", "") + .lower() + for language in programming_languages + ), + hits, + ) if visit_types is not None: visit_types_set = set(visit_types) diff --git a/swh/search/interface.py b/swh/search/interface.py --- a/swh/search/interface.py +++ b/swh/search/interface.py @@ -72,6 +72,8 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + programming_languages: List[str] = [], + licenses: List[str] = [], sort_by: List[str] = [], limit: int = 50, ) -> PagedResult[MinimalOriginDict]: @@ -95,6 +97,10 @@ last_revision_date on or after the provided date(ISO format) min_last_release_date: Filter origins that have last_release_date on or after the provided date(ISO format) + licenses: Filter origins with licenses present in the given list + (based on intrinsic_metadata) + programming_languages: Filter origins with programming languages + present in the given list (based on intrinsic_metadata) sort_by: Sort results based on a list of fields mentioned in SORT_BY_OPTIONS (nb_visits,last_visit_date, last_eventful_visit_date, last_revision_date, last_release_date). 
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -254,6 +254,8 @@ "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", + "programmingLanguage": "python", + "license": "MIT", }, }, ] @@ -266,6 +268,8 @@ "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", + "programmingLanguage": "python", + "license": "MIT", }, }, ] diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -449,6 +449,82 @@ _check_results(["nb_visits", "-last_visit_date"], ORIGINS) _check_results(["-last_visit_date", "nb_visits"], ORIGINS[::-1]) + def test_origin_instrinsic_metadata_license_search(self): + ORIGINS = [ + { + "url": "http://foobar.1.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + "license": "https://spdx.org/licenses/MIT", + }, + }, + { + "url": "http://foobar.2.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + "license": "BSD-3-Clause", + }, + }, + ] + self.search.origin_update(ORIGINS) + self.search.flush() + + page = self.search.origin_search(url_pattern="foobar", licenses=["MIT"]) + results = [r["url"] for r in page.results] + assert results == [ORIGINS[0]["url"]] + + page = self.search.origin_search(url_pattern="foobar", licenses=["bsd"]) + results = [r["url"] for r in page.results] + assert results == [ORIGINS[1]["url"]] + + page = self.search.origin_search( + url_pattern="foobar", licenses=["mit", "3-Clause"] + ) + results = [r["url"] for r in page.results] + assert sorted(results) == sorted([o["url"] for o in ORIGINS]) + + def test_origin_instrinsic_metadata_programming_language_search(self): + 
ORIGINS = [ + { + "url": "http://foobar.1.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + "programmingLanguage": "python", + }, + }, + { + "url": "http://foobar.2.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + "programmingLanguage": "javascript", + }, + }, + ] + self.search.origin_update(ORIGINS) + self.search.flush() + + page = self.search.origin_search( + url_pattern="foobar", programming_languages=["python"] + ) + results = [r["url"] for r in page.results] + assert results == [ORIGINS[0]["url"]] + + page = self.search.origin_search( + url_pattern="foobar", programming_languages=["javascript"] + ) + results = [r["url"] for r in page.results] + assert results == [ORIGINS[1]["url"]] + + page = self.search.origin_search( + url_pattern="foobar", programming_languages=["python", "javascript"] + ) + results = [r["url"] for r in page.results] + assert sorted(results) == sorted([o["url"] for o in ORIGINS]) + def test_origin_update_with_no_visit_types(self): """ Update an origin with visit types first then with no visit types,