Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/sourceforge/tests/test_lister.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | |||||
import functools | import functools | ||||
import json | import json | ||||
from pathlib import Path | from pathlib import Path | ||||
import re | import re | ||||
from iso8601 import iso8601 | |||||
import pytest | import pytest | ||||
from requests.exceptions import HTTPError | from requests.exceptions import HTTPError | ||||
from swh.lister import USER_AGENT | from swh.lister import USER_AGENT | ||||
from swh.lister.sourceforge.lister import ( | from swh.lister.sourceforge.lister import ( | ||||
MAIN_SITEMAP_URL, | MAIN_SITEMAP_URL, | ||||
PROJECT_API_URL_FORMAT, | PROJECT_API_URL_FORMAT, | ||||
SourceForgeLister, | SourceForgeLister, | ||||
SourceForgeListerState, | |||||
) | ) | ||||
# Mapping of project name to namespace | # Mapping of project name to namespace | ||||
from swh.scheduler.model import ListedOrigin | |||||
TEST_PROJECTS = { | TEST_PROJECTS = { | ||||
"adobexmp": "adobe", | "adobexmp": "adobe", | ||||
"backapps": "p", | "backapps": "p", | ||||
"backapps/website": "p", | "backapps/website": "p", | ||||
"mojunk": "p", | "mojunk": "p", | ||||
"mramm": "p", | "mramm": "p", | ||||
"os3dmodels": "p", | "os3dmodels": "p", | ||||
} | } | ||||
Show All 23 Lines | def get_project_json(datadir, request, context): | ||||
project = project.replace("/", "-") | project = project.replace("/", "-") | ||||
return json.loads(Path(datadir, f"{project}.json").read_text()) | return json.loads(Path(datadir, f"{project}.json").read_text()) | ||||
def _check_request_headers(request):
    """Tell whether ``request`` was sent with the lister's ``User-Agent``.

    Used in this module as the ``additional_matcher`` of the ``requests_mock``
    registrations, so a mocked URL only matches when the header is correct.
    """
    sent_user_agent = request.headers.get("User-Agent")
    return sent_user_agent == USER_AGENT
def _check_listed_origins(lister, swh_scheduler):
    """Assert that the scheduler holds exactly the expected listed origins.

    The origins recorded for ``lister`` are fetched from ``swh_scheduler`` and
    reduced to a ``url -> (visit_type, last_update date)`` mapping, which must
    equal the nine repositories expected from the test dataset.

    Raises:
        AssertionError: if the recorded origins differ from the expected ones.
    """
    expected = {
        "svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
        "git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
        "svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
        "git.code.sf.net/p/mramm/files": ("git", "2019-04-04"),
        "git.code.sf.net/p/mramm/git": ("git", "2019-04-04"),
        "svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"),
        "git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
        "git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
        "svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
    }

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    # Only the date part of ``last_update`` is compared, as an ISO string.
    res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins}
    assert res == expected
def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir): | def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir): | ||||
""" | """ | ||||
Simulate a full listing of an artificially restricted sourceforge. | Simulate a full listing of an artificially restricted sourceforge. | ||||
There are 5 different projects, spread over two sub-sitemaps, a few of which | There are 5 different projects, spread over two sub-sitemaps, a few of which | ||||
have multiple VCS listed, one has none, one is outside of the standard `/p/` | have multiple VCS listed, one has none, one is outside of the standard `/p/` | ||||
namespace, some with custom mount points. | namespace, some with custom mount points. | ||||
All non-interesting but related entries have been kept. | All non-interesting but related entries have been kept. | ||||
""" | """ | ||||
Show All 23 Lines | def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir): | ||||
stats = lister.run() | stats = lister.run() | ||||
# - os3dmodels (2 repos), | # - os3dmodels (2 repos), | ||||
# - mramm (3 repos), | # - mramm (3 repos), | ||||
# - mojunk (3 repos), | # - mojunk (3 repos), | ||||
# - backapps/website (1 repo). | # - backapps/website (1 repo). | ||||
# adobe and backapps itself have no repos. | # adobe and backapps itself have no repos. | ||||
assert stats.pages == 4 | assert stats.pages == 4 | ||||
assert stats.origins == 9 | assert stats.origins == 9 | ||||
expected_state = { | |||||
"subsitemap_last_modified": { | |||||
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", | |||||
"https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18", | |||||
}, | |||||
"empty_projects": { | |||||
"https://sourceforge.net/rest/p/backapps": "2021-02-11", | |||||
"https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17", | |||||
}, | |||||
} | |||||
assert lister.state_to_dict(lister.state) == expected_state | |||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results | _check_listed_origins(lister, swh_scheduler) | ||||
res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins} | |||||
assert res == { | |||||
"svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"), | def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, mocker): | ||||
"git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"), | """ | ||||
"svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"), | Simulate an incremental listing of an artificially restricted sourceforge. | ||||
"git.code.sf.net/p/mramm/files": ("git", "2019-04-04"), | Same dataset as the full run, because it's enough to validate the different cases. | ||||
"git.code.sf.net/p/mramm/git": ("git", "2019-04-04"), | """ | ||||
"svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"), | lister = SourceForgeLister(scheduler=swh_scheduler, incremental=True) | ||||
"git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"), | |||||
"git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), | requests_mock.get( | ||||
"svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), | MAIN_SITEMAP_URL, | ||||
text=get_main_sitemap(datadir), | |||||
additional_matcher=_check_request_headers, | |||||
) | |||||
def not_called(request, *args, **kwargs): | |||||
raise AssertionError(f"Should not have been called: '{request.url}'") | |||||
requests_mock.get( | |||||
"https://sourceforge.net/allura_sitemap/sitemap-0.xml", | |||||
text=get_subsitemap_0(datadir), | |||||
additional_matcher=_check_request_headers, | |||||
) | |||||
requests_mock.get( | |||||
"https://sourceforge.net/allura_sitemap/sitemap-1.xml", | |||||
text=not_called, | |||||
additional_matcher=_check_request_headers, | |||||
) | |||||
def filtered_get_project_json(request, context): | |||||
# These projects should not be requested again | |||||
assert URLS_MATCHER[request.url] not in {"adobe", "mojunk"} | |||||
return get_project_json(datadir, request, context) | |||||
requests_mock.get( | |||||
re.compile("https://sourceforge.net/rest/.*"), | |||||
json=filtered_get_project_json, | |||||
additional_matcher=_check_request_headers, | |||||
) | |||||
faked_listed_origins = [ | |||||
# mramm: changed | |||||
ListedOrigin( | |||||
lister_id=lister.lister_obj.id, | |||||
visit_type="git", | |||||
url="git.code.sf.net/p/mramm/files", | |||||
last_update=iso8601.parse_date("2019-01-01"), | |||||
), | |||||
ListedOrigin( | |||||
lister_id=lister.lister_obj.id, | |||||
visit_type="git", | |||||
url="git.code.sf.net/p/mramm/git", | |||||
last_update=iso8601.parse_date("2019-01-01"), | |||||
), | |||||
ListedOrigin( | |||||
lister_id=lister.lister_obj.id, | |||||
visit_type="svn", | |||||
url="svn.code.sf.net/p/mramm/svn", | |||||
last_update=iso8601.parse_date("2019-01-01"), | |||||
), | |||||
# stayed the same, even though its subsitemap has changed | |||||
ListedOrigin( | |||||
lister_id=lister.lister_obj.id, | |||||
visit_type="git", | |||||
url="git.code.sf.net/p/os3dmodels/git", | |||||
last_update=iso8601.parse_date("2017-03-31"), | |||||
), | |||||
ListedOrigin( | |||||
lister_id=lister.lister_obj.id, | |||||
visit_type="svn", | |||||
url="svn.code.sf.net/p/os3dmodels/svn", | |||||
last_update=iso8601.parse_date("2017-03-31"), | |||||
), | |||||
# others: stayed the same, should be skipped | |||||
ListedOrigin( | |||||
lister_id=lister.lister_obj.id, | |||||
visit_type="git", | |||||
url="git.code.sf.net/p/mojunk/git", | |||||
last_update=iso8601.parse_date("2017-12-31"), | |||||
), | |||||
ListedOrigin( | |||||
lister_id=lister.lister_obj.id, | |||||
visit_type="git", | |||||
url="git.code.sf.net/p/mojunk/git2", | |||||
last_update=iso8601.parse_date("2017-12-31"), | |||||
), | |||||
ListedOrigin( | |||||
lister_id=lister.lister_obj.id, | |||||
visit_type="svn", | |||||
url="svn.code.sf.net/p/mojunk/svn", | |||||
last_update=iso8601.parse_date("2017-12-31"), | |||||
), | |||||
ListedOrigin( | |||||
lister_id=lister.lister_obj.id, | |||||
visit_type="svn", | |||||
url="svn.code.sf.net/p/backapps/website/code", | |||||
last_update=iso8601.parse_date("2021-02-11"), | |||||
), | |||||
] | |||||
swh_scheduler.record_listed_origins(faked_listed_origins) | |||||
to_date = datetime.date.fromisoformat | |||||
faked_state = SourceForgeListerState( | |||||
subsitemap_last_modified={ | |||||
# changed | |||||
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": to_date( | |||||
"2021-02-18" | |||||
), | |||||
# stayed the same | |||||
"https://sourceforge.net/allura_sitemap/sitemap-1.xml": to_date( | |||||
"2021-03-18" | |||||
), | |||||
}, | |||||
empty_projects={ | |||||
"https://sourceforge.net/rest/p/backapps": to_date("2020-02-11"), | |||||
"https://sourceforge.net/rest/adobe/adobexmp": to_date("2017-10-17"), | |||||
}, | |||||
) | |||||
lister.state = faked_state | |||||
stats = lister.run() | |||||
# - mramm (3 repos), # changed | |||||
assert stats.pages == 1 | |||||
assert stats.origins == 3 | |||||
vlorentz: Can you compare the values? | |||||
Done Inline Actions: I'm not sure which values you're referring to. Alphare: I'm not sure which values you're referring to. | |||||
Not Done Inline Actions: nvm, my comment doesn't make sense. I must have read your code as `assert len(stats.pages) == 1`. vlorentz: nvm, my comment doesn't make sense. I must have read your code as `assert len(stats.pages) == 1` | |||||
expected_state = { | |||||
"subsitemap_last_modified": { | |||||
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", | |||||
"https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18", | |||||
}, | |||||
"empty_projects": { | |||||
"https://sourceforge.net/rest/p/backapps": "2021-02-11", # changed | |||||
"https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17", | |||||
}, | |||||
} | } | ||||
assert lister.state_to_dict(lister.state) == expected_state | |||||
# origins have been updated | |||||
_check_listed_origins(lister, swh_scheduler) | |||||
def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir): | def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir): | ||||
# Exponential retries take a long time, so stub time.sleep | # Exponential retries take a long time, so stub time.sleep | ||||
mocked_sleep = mocker.patch("time.sleep", return_value=None) | mocked_sleep = mocker.patch("time.sleep", return_value=None) | ||||
lister = SourceForgeLister(scheduler=swh_scheduler) | lister = SourceForgeLister(scheduler=swh_scheduler) | ||||
▲ Show 20 Lines • Show All 60 Lines • Show Last 20 Lines |
Can you compare the values?