diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -202,18 +202,24 @@ information on the origin. """ - resp = requests.head(origin_url) + resp = requests.head(origin_url, allow_redirects=True) exists = resp.ok content_length: Optional[int] = None last_modified: Optional[str] = None if exists: - size_ = resp.headers.get("Content-Length") + # Also process X-Archive-Orig-* headers in case the URL targets the + # Internet Archive. + size_ = resp.headers.get( + "Content-Length", resp.headers.get("X-Archive-Orig-Content-Length") + ) content_length = int(size_) if size_ else None try: - date_str = resp.headers["Last-Modified"] + date_str = resp.headers.get( + "Last-Modified", resp.headers.get("X-Archive-Orig-Last-Modified", "") + ) date = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %Z") last_modified = date.isoformat() - except (KeyError, ValueError): + except ValueError: # if not provided or not parsable as per the expected format, keep it None pass diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py --- a/swh/web/tests/common/test_origin_save.py +++ b/swh/web/tests/common/test_origin_save.py @@ -420,6 +420,38 @@ ) +def test_origin_exists_internet_archive(requests_mock): + """Edge case where an artifact URL to check existence is hosted on the + Internet Archive""" + url = ( + "https://web.archive.org/web/20100705043309/" + "http://www.cs.unm.edu/~mccune/old-ftp/eqp-09e.tar.gz" + ) + redirect_url = ( + "https://web.archive.org/web/20100610004108/" + "http://www.cs.unm.edu/~mccune/old-ftp/eqp-09e.tar.gz" + ) + requests_mock.head( + url, status_code=302, headers={"Location": redirect_url,}, + ) + requests_mock.head( + redirect_url, + status_code=200, + headers={ + "X-Archive-Orig-Last-Modified": "Tue, 12 May 2009 22:09:43 GMT", + "X-Archive-Orig-Content-Length": "121421", + }, + ) + + actual_result = origin_exists(url) + assert actual_result == OriginExistenceCheckInfo( + origin_url=url, + exists=True, + content_length=121421, + last_modified="2009-05-12T22:09:43", + ) + + def test_origin_exists_200_with_data_unexpected_date_format(requests_mock): """Existing origin should be ok, unexpected last modif time result in no time""" url = "http://example.org/real-url2"