# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
import functools
import json
from pathlib import Path
import re

from iso8601 import iso8601
import pytest
from requests.exceptions import HTTPError

from swh.lister import USER_AGENT_TEMPLATE
from swh.lister.sourceforge.lister import (
    MAIN_SITEMAP_URL,
    PROJECT_API_URL_FORMAT,
    SourceForgeLister,
    SourceForgeListerState,
)
from swh.lister.tests.test_utils import assert_sleep_calls
from swh.lister.utils import WAIT_EXP_BASE
from swh.scheduler.model import ListedOrigin

# Mapping of project name to namespace
TEST_PROJECTS = {
    "aaron": "p",
    "adobexmp": "adobe",
    "backapps": "p",
    "backapps/website": "p",
    "bzr-repo": "p",
    "mojunk": "p",
    "mramm": "p",
    "os3dmodels": "p",
    "random-mercurial": "p",
    "t12eksandbox": "p",
    "ocaml-lpd": "p",
}

URLS_MATCHER = {
    PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project): project
    for project, namespace in TEST_PROJECTS.items()
}
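
# URLS_MATCHER inverts TEST_PROJECTS so that the request callbacks below can
# map a mocked API URL back to the project whose fixture should be served,
# e.g. (assuming PROJECT_API_URL_FORMAT expands to
# "https://sourceforge.net/rest/{namespace}/{project}"):
#   URLS_MATCHER["https://sourceforge.net/rest/p/aaron"] == "aaron"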


def get_main_sitemap(datadir):
    return Path(datadir, "main-sitemap.xml").read_text()


def get_subsitemap_0(datadir):
    return Path(datadir, "subsitemap-0.xml").read_text()


def get_subsitemap_1(datadir):
    return Path(datadir, "subsitemap-1.xml").read_text()


def get_project_json(datadir, request, context):
    url = request.url
    project = URLS_MATCHER.get(url)
    assert project is not None, f"Url '{url}' could not be matched"
    project = project.replace("/", "-")
    return json.loads(Path(datadir, f"{project}.json").read_text())
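
# The "/" -> "-" substitution above means a sub-project such as
# "backapps/website" is read from the flat fixture file "backapps-website.json".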


def get_cvs_info_page(datadir):
    return Path(datadir, "aaron.html").read_text()


def get_bzr_repo_page(datadir, repo_name):
    return Path(datadir, f"{repo_name}.html").read_text()


def _check_request_headers(request):
    return (
        request.headers.get("User-Agent")
        == USER_AGENT_TEMPLATE % SourceForgeLister.LISTER_NAME
    )
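
# requests_mock only lets a stub answer when its additional_matcher returns
# True, so passing _check_request_headers to every stub below doubles as an
# assertion that each outgoing request carries the expected User-Agent header.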


def _check_listed_origins(lister, swh_scheduler):
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    res = {
        o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins
    }
    assert res == {
        "https://svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
        "https://git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
        "https://svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
        "https://git.code.sf.net/p/mramm/files": ("git", "2019-04-04"),
        "https://git.code.sf.net/p/mramm/git": ("git", "2019-04-04"),
        "https://svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"),
        "https://git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
        "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
        "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
        "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
        "http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox": (
            "bzr",
            "2011-02-09",
        ),
        "http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd/trunk": (
            "bzr",
            "2011-04-17",
        ),
        "rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron": ("cvs", "2013-03-07"),
        "rsync://a.cvs.sourceforge.net/cvsroot/aaron/www": ("cvs", "2013-03-07"),
    }


def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
    """
    Simulate a full listing of an artificially restricted SourceForge.
    The projects are spread over two sub-sitemaps; a few of them have multiple
    VCS listed, some have none, one is outside of the standard `/p/` namespace,
    and some use custom mount points.
    All non-interesting but related entries have been kept.
    """
    lister = SourceForgeLister(scheduler=swh_scheduler)

    requests_mock.get(
        MAIN_SITEMAP_URL,
        text=get_main_sitemap(datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        "https://sourceforge.net/allura_sitemap/sitemap-0.xml",
        text=get_subsitemap_0(datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        "https://sourceforge.net/allura_sitemap/sitemap-1.xml",
        text=get_subsitemap_1(datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("https://sourceforge.net/rest/.*"),
        json=functools.partial(get_project_json, datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://aaron.cvs.sourceforge.net/"),
        text=get_cvs_info_page(datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"),
        text=get_bzr_repo_page(datadir, "t12eksandbox"),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"),
        text=get_bzr_repo_page(datadir, "ocaml-lpd"),
        additional_matcher=_check_request_headers,
    )

    stats = lister.run()

    # - os3dmodels (2 repos),
    # - mramm (3 repos),
    # - mojunk (3 repos),
    # - backapps/website (1 repo),
    # - random-mercurial (1 repo),
    # - t12eksandbox (1 repo),
    # - ocaml-lpd (1 repo),
    # - aaron (2 CVS repos).
    # adobe and backapps itself have no repos.
    assert stats.pages == 8
    assert stats.origins == 14

    expected_state = {
        "subsitemap_last_modified": {
            "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
            "https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18",
        },
        "empty_projects": {
            "https://sourceforge.net/rest/p/backapps": "2021-02-11",
            "https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17",
        },
    }
    assert lister.state_to_dict(lister.state) == expected_state

    _check_listed_origins(lister, swh_scheduler)


def test_sourceforge_lister_incremental(
    swh_scheduler, requests_mock, datadir, mocker
):
    """
    Simulate an incremental listing of an artificially restricted SourceForge.
    It uses the same dataset as the full run, which is enough to validate the
    different cases.
    """
    lister = SourceForgeLister(scheduler=swh_scheduler, incremental=True)

    requests_mock.get(
        MAIN_SITEMAP_URL,
        text=get_main_sitemap(datadir),
        additional_matcher=_check_request_headers,
    )

    def not_called(request, *args, **kwargs):
        raise AssertionError(f"Should not have been called: '{request.url}'")

    requests_mock.get(
        "https://sourceforge.net/allura_sitemap/sitemap-0.xml",
        text=get_subsitemap_0(datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        "https://sourceforge.net/allura_sitemap/sitemap-1.xml",
        text=not_called,
        additional_matcher=_check_request_headers,
    )

    def filtered_get_project_json(request, context):
        # These projects should not be requested again
        assert URLS_MATCHER[request.url] not in {"adobexmp", "mojunk"}
        return get_project_json(datadir, request, context)

    requests_mock.get(
        re.compile("https://sourceforge.net/rest/.*"),
        json=filtered_get_project_json,
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://aaron.cvs.sourceforge.net/"),
        text=get_cvs_info_page(datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"),
        text=get_bzr_repo_page(datadir, "t12eksandbox"),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"),
        text=get_bzr_repo_page(datadir, "ocaml-lpd"),
        additional_matcher=_check_request_headers,
    )

    faked_listed_origins = [
        # mramm: changed
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="git",
            url="https://git.code.sf.net/p/mramm/files",
            last_update=iso8601.parse_date("2019-01-01"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="git",
            url="https://git.code.sf.net/p/mramm/git",
            last_update=iso8601.parse_date("2019-01-01"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="svn",
            url="https://svn.code.sf.net/p/mramm/svn",
            last_update=iso8601.parse_date("2019-01-01"),
        ),
        # stayed the same, even though its subsitemap has changed
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="git",
            url="https://git.code.sf.net/p/os3dmodels/git",
            last_update=iso8601.parse_date("2017-03-31"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="svn",
            url="https://svn.code.sf.net/p/os3dmodels/svn",
            last_update=iso8601.parse_date("2017-03-31"),
        ),
        # others: stayed the same, should be skipped
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="git",
            url="https://git.code.sf.net/p/mojunk/git",
            last_update=iso8601.parse_date("2017-12-31"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="git",
            url="https://git.code.sf.net/p/mojunk/git2",
            last_update=iso8601.parse_date("2017-12-31"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="svn",
            url="https://svn.code.sf.net/p/mojunk/svn",
            last_update=iso8601.parse_date("2017-12-31"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="svn",
            url="https://svn.code.sf.net/p/backapps/website/code",
            last_update=iso8601.parse_date("2021-02-11"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="hg",
            url="http://hg.code.sf.net/p/random-mercurial/hg",
            last_update=iso8601.parse_date("2019-05-02"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="bzr",
            url="http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox",
            last_update=iso8601.parse_date("2011-02-09"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="bzr",
            url="http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd/trunk",
            last_update=iso8601.parse_date("2011-04-17"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="cvs",
            url="rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron",
            last_update=iso8601.parse_date("2013-03-07"),
        ),
        ListedOrigin(
            lister_id=lister.lister_obj.id,
            visit_type="cvs",
            url="rsync://a.cvs.sourceforge.net/cvsroot/aaron/www",
            last_update=iso8601.parse_date("2013-03-07"),
        ),
    ]

    swh_scheduler.record_listed_origins(faked_listed_origins)

    to_date = datetime.date.fromisoformat
    faked_state = SourceForgeListerState(
        subsitemap_last_modified={
            # changed
            "https://sourceforge.net/allura_sitemap/sitemap-0.xml": to_date(
                "2021-02-18"
            ),
            # stayed the same
            "https://sourceforge.net/allura_sitemap/sitemap-1.xml": to_date(
                "2021-03-18"
            ),
        },
        empty_projects={
            "https://sourceforge.net/rest/p/backapps": to_date("2020-02-11"),
            "https://sourceforge.net/rest/adobe/adobexmp": to_date("2017-10-17"),
        },
    )
    lister.state = faked_state
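
    # With this faked state, sitemap-1 (whose Last-Modified is unchanged) must
    # not be fetched again, and within the re-fetched sitemap-0 only projects
    # modified after the recorded dates should be listed anew; the not_called
    # and filtered_get_project_json mocks above enforce exactly that.
    # SourceForgeListerState stores real datetime.date objects (hence the
    # to_date conversion), while state_to_dict below serializes them back to
    # ISO strings.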

    stats = lister.run()

    # - mramm (3 repos),  # changed
    assert stats.pages == 1
    assert stats.origins == 3

    expected_state = {
        "subsitemap_last_modified": {
            "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
            "https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18",
        },
        "empty_projects": {
            "https://sourceforge.net/rest/p/backapps": "2021-02-11",  # changed
            "https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17",
        },
    }
    assert lister.state_to_dict(lister.state) == expected_state

    # origins have been updated
    _check_listed_origins(lister, swh_scheduler)


def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir):
    lister = SourceForgeLister(scheduler=swh_scheduler)

    # Exponential retries take a long time, so stub time.sleep
    mocked_sleep = mocker.patch.object(lister.http_request.retry, "sleep")

    requests_mock.get(
        MAIN_SITEMAP_URL,
        [
            {"status_code": 429},
            {"status_code": 429},
            {"text": get_main_sitemap(datadir)},
        ],
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        "https://sourceforge.net/allura_sitemap/sitemap-0.xml",
        [
            {"status_code": 429},
            {"text": get_subsitemap_0(datadir), "status_code": 301},
        ],
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        "https://sourceforge.net/allura_sitemap/sitemap-1.xml",
        [
            {"status_code": 429},
            {"text": get_subsitemap_1(datadir)},
        ],
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("https://sourceforge.net/rest/.*"),
        [
            {"status_code": 429},
            {"json": functools.partial(get_project_json, datadir)},
        ],
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://aaron.cvs.sourceforge.net/"),
        text=get_cvs_info_page(datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"),
        text=get_bzr_repo_page(datadir, "t12eksandbox"),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"),
        text=get_bzr_repo_page(datadir, "ocaml-lpd"),
        additional_matcher=_check_request_headers,
    )

    stats = lister.run()

    # - os3dmodels (2 repos),
    # - mramm (3 repos),
    # - mojunk (3 repos),
    # - backapps/website (1 repo),
    # - random-mercurial (1 repo),
    # - t12eksandbox (1 repo),
    # - ocaml-lpd (1 repo),
    # - aaron (2 CVS repos).
    # adobe and backapps itself have no repos.
    assert stats.pages == 8
    assert stats.origins == 14

    _check_listed_origins(lister, swh_scheduler)

    # Check that `time.sleep` was called with exponentially growing retries
    assert_sleep_calls(mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, 1])
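
    # The expected list reflects the retry policy: the sleep between retries
    # of the same request grows by a factor of WAIT_EXP_BASE, and the backoff
    # resets to 1 for each newly throttled request.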


@pytest.mark.parametrize("status_code", [500, 503, 504, 403, 404])
def test_sourceforge_lister_http_error(
    swh_scheduler, requests_mock, status_code, mocker
):
    lister = SourceForgeLister(scheduler=swh_scheduler)

    # Exponential retries take a long time, so stub time.sleep
    mocked_sleep = mocker.patch.object(lister.http_request.retry, "sleep")

    requests_mock.get(MAIN_SITEMAP_URL, status_code=status_code)

    with pytest.raises(HTTPError):
        lister.run()

    exp_retries = []
    if status_code >= 500:
        exp_retries = [1.0, 10.0, 100.0, 1000.0]

    assert_sleep_calls(mocker, mocked_sleep, exp_retries)
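
    # Only server errors (>= 500) are retried; the expected waits grow tenfold
    # per attempt (1 -> 10 -> 100 -> 1000, which suggests WAIT_EXP_BASE is 10).
    # Client errors such as 403/404 raise immediately, with no sleep recorded.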


@pytest.mark.parametrize("status_code", [500, 503, 504, 403, 404])
def test_sourceforge_lister_project_error(
    datadir, swh_scheduler, requests_mock, status_code, mocker
):
    lister = SourceForgeLister(scheduler=swh_scheduler)

    # Exponential retries take a long time, so stub time.sleep
    mocker.patch.object(lister.http_request.retry, "sleep")

    requests_mock.get(
        MAIN_SITEMAP_URL,
        text=get_main_sitemap(datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        "https://sourceforge.net/allura_sitemap/sitemap-0.xml",
        text=get_subsitemap_0(datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        "https://sourceforge.net/allura_sitemap/sitemap-1.xml",
        text=get_subsitemap_1(datadir),
        additional_matcher=_check_request_headers,
    )
    # Request mocks precedence is LIFO: the failing `mramm` and CVS mocks
    # registered below take precedence over this catch-all REST mock.
    requests_mock.get(
        re.compile("https://sourceforge.net/rest/.*"),
        json=functools.partial(get_project_json, datadir),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"),
        text=get_bzr_repo_page(datadir, "t12eksandbox"),
        additional_matcher=_check_request_headers,
    )
    requests_mock.get(
        re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"),
        text=get_bzr_repo_page(datadir, "ocaml-lpd"),
        additional_matcher=_check_request_headers,
    )
    # Make all `mramm` requests fail
    # `mramm` is in subsitemap 0, which ensures we keep listing after an error.
    requests_mock.get(
        re.compile("https://sourceforge.net/rest/p/mramm"), status_code=status_code
    )
    # Make the request to the CVS info page fail
    requests_mock.get(
        re.compile("http://aaron.cvs.sourceforge.net/"), status_code=status_code
    )

    stats = lister.run()

    # - os3dmodels (2 repos),
    # - mojunk (3 repos),
    # - backapps/website (1 repo),
    # - random-mercurial (1 repo),
    # - t12eksandbox (1 repo),
    # - ocaml-lpd (1 repo).
    # adobe and backapps itself have no repos.
    # mramm and aaron were *not* listed: their requests failed.
    assert stats.pages == 6
    assert stats.origins == 9

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    res = {
        o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins
    }
    # Ensure no `mramm` or `aaron` origins are listed, but all others are.
    assert res == {
        "https://svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
        "https://git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
        "https://svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
        "https://git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
        "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
        "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
        "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
        "http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox": (
            "bzr",
            "2011-02-09",
        ),
        "http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd/trunk": (
            "bzr",
            "2011-04-17",
        ),
    }