diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -24,7 +24,7 @@ ) from swh.search.metrics import send_metric, timed from swh.search.translator import Translator -from swh.search.utils import escape, get_expansion, is_date_parsable +from swh.search.utils import escape, get_expansion, parse_and_format_date logger = logging.getLogger(__name__) @@ -79,8 +79,11 @@ # If date{Created,Modified,Published} value isn't parsable # It gets rejected and isn't stored (unlike other fields) - if not is_date_parsable(date): + formatted_date = parse_and_format_date(date) + if formatted_date is None: intrinsic_metadata.pop(date_field) + else: + intrinsic_metadata[date_field] = formatted_date res["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -18,7 +18,7 @@ OriginDict, PagedResult, ) -from swh.search.utils import get_expansion, is_date_parsable +from swh.search.utils import get_expansion, parse_and_format_date _words_regexp = re.compile(r"\w+") @@ -242,8 +242,11 @@ # If date{Created,Modified,Published} value isn't parsable # It gets rejected and isn't stored (unlike other fields) - if not is_date_parsable(date): + formatted_date = parse_and_format_date(date) + if formatted_date is None: intrinsic_metadata.pop(date_field) + else: + intrinsic_metadata[date_field] = formatted_date document["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -478,6 +478,52 @@ sort_by=["date_created"], origin_indices=[0, 1, 2], sort_results=False ) + def test_origin_instrinsic_metadata_dates_processing(self): + + DATE_0 = "foo" # will be discarded + DATE_1 = "2001-2-13" # will be formatted to 2001-02-13 + DATE_2 = "2005-10-2" # will be formatted to 2005-10-02 + + ORIGINS = [ + { + "url": "http://foobar.0.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "dateCreated": DATE_0, + "dateModified": DATE_1, + "datePublished": DATE_2, + }, + }, + { + "url": "http://foobar.1.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "dateCreated": DATE_1, + "dateModified": DATE_2, + "datePublished": DATE_2, + }, + }, + { + "url": "http://foobar.2.com", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "dateCreated": DATE_2, + "dateModified": DATE_2, + "datePublished": DATE_2, + }, + }, + ] + self.search.origin_update(ORIGINS) + self.search.flush() + + # check origins have been successfully processed + page = self.search.origin_search(url_pattern="foobar") + assert {r["url"] for r in page.results} == { + "http://foobar.0.com", + "http://foobar.2.com", + "http://foobar.1.com", + } + def test_origin_keywords_search(self): ORIGINS = [ { diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py --- a/swh/search/tests/test_translator.py +++ b/swh/search/tests/test_translator.py @@ -1,3 +1,8 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + import pytest from swh.search.translator import Translator diff --git a/swh/search/tests/test_utils.py b/swh/search/tests/test_utils.py new file mode 100644 --- /dev/null +++ b/swh/search/tests/test_utils.py @@ -0,0 +1,23 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from swh.search.utils import parse_and_format_date + + +@pytest.mark.parametrize( + "date_str", + ["2021-07-03", "2021-7-03", "2021-07-3", "2021-7-3", "2021-07-03T15:17:08Z"], +) +def test_parse_and_format_date_success(date_str): + assert parse_and_format_date(date_str) == "2021-07-03" + + +@pytest.mark.parametrize( + "date_str", ["foo", "2021/07/03", "2021+07+03T15,17,08Z"], +) +def test_parse_and_format_date_failure(date_str): + assert parse_and_format_date(date_str) is None diff --git a/swh/search/utils.py b/swh/search/utils.py --- a/swh/search/utils.py +++ b/swh/search/utils.py @@ -5,6 +5,7 @@ import codecs from datetime import datetime +from typing import Optional import iso8601 # type: ignore @@ -46,21 +47,19 @@ return METADATA_FIELDS[field] -def is_date_parsable(date_str): +def parse_and_format_date(date_str: str) -> Optional[str]: """ - Return True if date_str is in the format - %Y-%m-%d or the standard ISO format. - Otherwise return False. + Parses a string date in the format %Y-%m-%d or ISO8601 and returns + a new string date in the format YYYY-mm-dd if the parsing succeeded + otherwise None. """ try: - datetime.strptime(date_str, "%Y-%m-%d") - return True + return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d") except Exception: try: - iso8601.parse_date(date_str) - return True + return iso8601.parse_date(date_str).strftime("%Y-%m-%d") except Exception: - return False + return None def escape(obj):