diff --git a/swh/web/api/utils.py b/swh/web/api/utils.py --- a/swh/web/api/utils.py +++ b/swh/web/api/utils.py @@ -238,7 +238,7 @@ ) revision["children_urls"] = children - if "message_decoding_failed" in revision: + if "decoding_failures" in revision and "message" in revision["decoding_failures"]: revision["message_url"] = reverse( "api-1-revision-raw-message", url_args={"sha1_git": revision["id"]}, diff --git a/swh/web/common/converters.py b/swh/web/common/converters.py --- a/swh/web/common/converters.py +++ b/swh/web/common/converters.py @@ -279,7 +279,7 @@ revision_d = from_swh( revision_d, hashess={"id", "directory", "parents", "children"}, - bytess={"name", "fullname", "email", "extra_headers"}, + bytess={"name", "fullname", "email", "extra_headers", "message"}, convert={"metadata"}, convert_fn=convert_revision_metadata, dates={"date", "committer_date"}, @@ -288,12 +288,6 @@ if revision_d: if "parents" in revision_d: revision_d["merge"] = len(revision_d["parents"]) > 1 - if "message" in revision_d: - try: - revision_d["message"] = revision_d["message"].decode("utf-8") - except UnicodeDecodeError: - revision_d["message_decoding_failed"] = True - revision_d["message"] = None return revision_d diff --git a/swh/web/templates/includes/apidoc-header.html b/swh/web/templates/includes/apidoc-header.html --- a/swh/web/templates/includes/apidoc-header.html +++ b/swh/web/templates/includes/apidoc-header.html @@ -155,6 +155,17 @@

Unavailability of the underlying storage backend will result in a 503 Service Unavailable HTTP response.

+

UTF-8 decoding errors

+

While decoding UTF-8 strings from raw bytes stored in the archive, some errors +might occur when generating an API response. In that case, an extra field decoding_failures +is added to each affected JSON object (possibly nested). It contains the list of that object's key +names for which UTF-8 decoding failed. +

+

+A string that could not be decoded will have the bytes of its invalid UTF-8 sequences escaped as +\\x<hex value>. +

+

Pagination

Requests that might potentially return many items will be paginated.

Page size is set to a default (usually: 10 items), but might be overridden with the diff --git a/swh/web/tests/api/test_utils.py b/swh/web/tests/api/test_utils.py --- a/swh/web/tests/api/test_utils.py +++ b/swh/web/tests/api/test_utils.py @@ -471,8 +471,7 @@ ): revision_data = archive_data.revision_get(revision) - revision_data["message"] = None - revision_data["message_decoding_failed"] = (True,) + revision_data["decoding_failures"] = ["message"] revision_data["parents"] = revision_data["parents"] + (parent_revision,) revision_data["children"] = child_revision diff --git a/swh/web/tests/common/test_converters.py b/swh/web/tests/common/test_converters.py --- a/swh/web/tests/common/test_converters.py +++ b/swh/web/tests/common/test_converters.py @@ -608,8 +608,8 @@ "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, - "message": None, - "message_decoding_failed": True, + "message": "invalid message \\xff", + "decoding_failures": ["message"], "date": "2000-01-17T11:23:54+00:00", "committer_date": "2000-01-17T11:23:54+00:00", "children": ["123546353ed3480476f032475e7c244eff7371d5"], diff --git a/swh/web/tests/common/test_service.py b/swh/web/tests/common/test_service.py --- a/swh/web/tests/common/test_service.py +++ b/swh/web/tests/common/test_service.py @@ -522,8 +522,8 @@ archive_data.revision_add([Revision.from_dict(new_revision)]) revision = service.lookup_revision(hash_to_hex(new_revision["id"])) - assert revision["message"] is None - assert revision["message_decoding_failed"] is True + assert revision["message"] == "elegant fix for bug \\xff" + assert revision["decoding_failures"] == ["message"] @given(new_revision())