lister.py

# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
import logging
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup
from requests import Session
from requests.adapters import HTTPAdapter

from .models import CGitModel
from swh.core.utils import grouper
from swh.lister.core.lister_base import ListerBase


logger = logging.getLogger(__name__)


class CGitLister(ListerBase):
    """Lister class for CGit repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    For each found git repository, a query is made at the given url found
    in this index to gather published "Clone" URLs to be used as origin
    URL for that git repo.

    If several "Clone" URLs are provided, prefer the http/https one, if
    any, otherwise fall back to the first one.

    A loader task is created for each git repository:

        Task:
            Type: load-git
            Policy: recurring
            Args:
                <git_clonable_url>

    Example:

        Task:
            Type: load-git
            Policy: recurring
            Args:
                'https://git.savannah.gnu.org/git/elisp-es.git'
    """
    MODEL = CGitModel
    DEFAULT_URL = 'https://git.savannah.gnu.org/cgit/'
    LISTER_NAME = 'cgit'
    url_prefix_present = True

    def __init__(self, url=None, instance=None, override_config=None):
        """Lister class for CGit repositories.

        Args:
            url (str): main URL of the CGit instance, i.e. url of the index
                of published git repositories on this instance.
            instance (str): Name of cgit instance. Defaults to url's hostname
                if unset.
        """
        super().__init__(override_config=override_config)

        if url is None:
            url = self.config.get('url', self.DEFAULT_URL)
        self.url = url

        if not instance:
            instance = urlparse(url).hostname
        self.instance = instance
        self.session = Session()
        # retry transient HTTP failures up to 3 times for requests under url
        self.session.mount(self.url, HTTPAdapter(max_retries=3))

    def run(self):
        total = 0
        # process repositories in batches of 10, committing after each batch
        for repos in grouper(self.get_repos(), 10):
            models = list(filter(None, (self.build_model(repo)
                                        for repo in repos)))
            injected_repos = self.inject_repo_data_into_db(models)
            self.schedule_missing_tasks(models, injected_repos)
            self.db_session.commit()
            total += len(injected_repos)

        logger.debug('Scheduled %s tasks for %s', total, self.url)

    def get_repos(self):
        """Generate git 'project' URLs found on the current CGit server
        """
        next_page = self.url
        while next_page:
            bs_idx = self.get_and_parse(next_page)
            for tr in bs_idx.find(
                    'div', {"class": "content"}).find_all(
                        "tr", {"class": ""}):
                yield urljoin(self.url, tr.find('a')['href'])

            try:
                pager = bs_idx.find('ul', {'class': 'pager'})
                current_page = pager.find('a', {'class': 'current'})
                if current_page:
                    # the link to the next page sits right after the
                    # currently highlighted entry in the pager
                    next_page = current_page.parent.next_sibling.a['href']
                    next_page = urljoin(self.url, next_page)
                else:
                    # pager present but no current page marker; stop rather
                    # than re-fetch the same page forever
                    next_page = None
            except (AttributeError, KeyError):
                # no pager, or no next page
                next_page = None

    def build_model(self, repo_url):
        """Given the URL of a git repo project page on a CGit server,
        return the repo description (dict) suitable for insertion in the db.
        """
        bs = self.get_and_parse(repo_url)
        urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})]

        if not urls:
            return

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            if urlparse(url).scheme in ('http', 'https'):
                origin_url = url
                break
        else:
            # otherwise, choose the first one
            origin_url = urls[0]

        return {'uid': repo_url,
                'name': bs.find('a', title=re.compile('.+'))['title'],
                'origin_type': 'git',
                'instance': self.instance,
                'origin_url': origin_url,
                }
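
    # For illustration (hypothetical values, not taken from a live server),
    # build_model('https://git.savannah.gnu.org/cgit/elisp-es.git/') would
    # return something like:
    #   {'uid': 'https://git.savannah.gnu.org/cgit/elisp-es.git/',
    #    'name': 'elisp-es',
    #    'origin_type': 'git',
    #    'instance': 'git.savannah.gnu.org',
    #    'origin_url': 'https://git.savannah.gnu.org/git/elisp-es.git'}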

    def get_and_parse(self, url):
        "Get the given url and parse the retrieved HTML using BeautifulSoup"
        return BeautifulSoup(self.session.get(url).text,
                             features='html.parser')
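

if __name__ == '__main__':
    # Minimal usage sketch: run the lister once against the default
    # instance. Assumes a working swh configuration (ListerBase reads its
    # database credentials from the swh config file), so this will not run
    # in an unconfigured environment; the production entry point is the
    # scheduler task, not this block.
    logging.basicConfig(level=logging.DEBUG)
    lister = CGitLister(url='https://git.savannah.gnu.org/cgit/')
    lister.run()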