Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/sourceforge/lister.py
Show All 9 Lines | |||||
import re | import re | ||||
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple | from typing import Any, Dict, Iterator, List, Optional, Set, Tuple | ||||
from xml.etree import ElementTree | from xml.etree import ElementTree | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
import iso8601 | import iso8601 | ||||
import lxml | import lxml | ||||
import requests | import requests | ||||
from tenacity.before_sleep import before_sleep_log | |||||
from swh.core.api.classes import stream_results | from swh.core.api.classes import stream_results | ||||
from swh.lister.utils import http_retry | |||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | |||||
from ..pattern import CredentialsType, Lister | from ..pattern import CredentialsType, Lister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
class VcsNames(Enum): | class VcsNames(Enum): | ||||
"""Used to filter SourceForge tool names for valid VCS types""" | """Used to filter SourceForge tool names for valid VCS types""" | ||||
▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines | ): | ||||
url="https://sourceforge.net", | url="https://sourceforge.net", | ||||
instance="main", | instance="main", | ||||
credentials=credentials, | credentials=credentials, | ||||
) | ) | ||||
# Will hold the currently saved "last modified" dates to compare against our | # Will hold the currently saved "last modified" dates to compare against our | ||||
# requests. | # requests. | ||||
self._project_last_modified: Optional[ProjectsLastModifiedCache] = None | self._project_last_modified: Optional[ProjectsLastModifiedCache] = None | ||||
self.session = requests.Session() | |||||
# Declaring the USER_AGENT is more sysadmin-friendly for the forge we list | self.session.headers.update({"Accept": "application/json"}) | ||||
self.session.headers.update( | |||||
{"Accept": "application/json", "User-Agent": USER_AGENT} | |||||
) | |||||
self.incremental = incremental | self.incremental = incremental | ||||
def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState: | def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState: | ||||
subsitemaps = { | subsitemaps = { | ||||
k: datetime.date.fromisoformat(v) | k: datetime.date.fromisoformat(v) | ||||
for k, v in d.get("subsitemap_last_modified", {}).items() | for k, v in d.get("subsitemap_last_modified", {}).items() | ||||
} | } | ||||
empty_projects = { | empty_projects = { | ||||
▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines | def projects_last_modified(self) -> ProjectsLastModifiedCache: | ||||
# within a project or subproject. An assertion here would be overkill. | # within a project or subproject. An assertion here would be overkill. | ||||
last_modified = origin.last_update | last_modified = origin.last_update | ||||
assert last_modified is not None | assert last_modified is not None | ||||
listed_origins[(namespace, project)] = last_modified.date() | listed_origins[(namespace, project)] = last_modified.date() | ||||
self._project_last_modified = listed_origins | self._project_last_modified = listed_origins | ||||
return listed_origins | return listed_origins | ||||
@http_retry( | |||||
before_sleep=before_sleep_log(logger, logging.WARNING), | |||||
) | |||||
def page_request(self, url, params) -> requests.Response: | |||||
# Log listed URL to ease debugging | |||||
logger.debug("Fetching URL %s with params %s", url, params) | |||||
response = self.session.get(url, params=params) | |||||
if response.status_code != 200: | |||||
# Log response content to ease debugging | |||||
logger.warning( | |||||
"Unexpected HTTP status code %s for URL %s", | |||||
response.status_code, | |||||
response.url, | |||||
) | |||||
# The lister must fail on blocking errors | |||||
response.raise_for_status() | |||||
return response | |||||
def get_pages(self) -> Iterator[SourceForgeListerPage]: | def get_pages(self) -> Iterator[SourceForgeListerPage]: | ||||
""" | """ | ||||
SourceForge has a main XML sitemap that lists its sharded sitemaps for all | SourceForge has a main XML sitemap that lists its sharded sitemaps for all | ||||
projects. | projects. | ||||
Each XML sub-sitemap lists project pages, which are not unique per project: a | Each XML sub-sitemap lists project pages, which are not unique per project: a | ||||
project can have a wiki, a home, a git, an svn, etc. | project can have a wiki, a home, a git, an svn, etc. | ||||
For each unique project, we query an API endpoint that lists (among | For each unique project, we query an API endpoint that lists (among | ||||
other things) the tools associated with said project, some of which are | other things) the tools associated with said project, some of which are | ||||
the VCS used. Subprojects are considered separate projects. | the VCS used. Subprojects are considered separate projects. | ||||
Lastly we use the information of which VCS are used to build the predictable | Lastly we use the information of which VCS are used to build the predictable | ||||
clone URL for any given VCS. | clone URL for any given VCS. | ||||
""" | """ | ||||
sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text | sitemap_contents = self.http_request(MAIN_SITEMAP_URL).text | ||||
tree = ElementTree.fromstring(sitemap_contents) | tree = ElementTree.fromstring(sitemap_contents) | ||||
for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): | for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): | ||||
last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | ||||
assert last_modified_el is not None and last_modified_el.text is not None | assert last_modified_el is not None and last_modified_el.text is not None | ||||
last_modified = datetime.date.fromisoformat(last_modified_el.text) | last_modified = datetime.date.fromisoformat(last_modified_el.text) | ||||
location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") | location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") | ||||
assert location is not None and location.text is not None | assert location is not None and location.text is not None | ||||
sub_url = location.text | sub_url = location.text | ||||
if self.incremental: | if self.incremental: | ||||
recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url) | recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url) | ||||
if recorded_last_mod == last_modified: | if recorded_last_mod == last_modified: | ||||
# The entire subsitemap hasn't changed, so none of its projects | # The entire subsitemap hasn't changed, so none of its projects | ||||
# have either, skip it. | # have either, skip it. | ||||
continue | continue | ||||
self.state.subsitemap_last_modified[sub_url] = last_modified | self.state.subsitemap_last_modified[sub_url] = last_modified | ||||
subsitemap_contents = self.page_request(sub_url, {}).text | subsitemap_contents = self.http_request(sub_url).text | ||||
subtree = ElementTree.fromstring(subsitemap_contents) | subtree = ElementTree.fromstring(subsitemap_contents) | ||||
yield from self._get_pages_from_subsitemap(subtree) | yield from self._get_pages_from_subsitemap(subtree) | ||||
def get_origins_from_page( | def get_origins_from_page( | ||||
self, page: SourceForgeListerPage | self, page: SourceForgeListerPage | ||||
) -> Iterator[ListedOrigin]: | ) -> Iterator[ListedOrigin]: | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines | ) -> SourceForgeListerPage: | ||||
return [] | return [] | ||||
else: | else: | ||||
logger.debug(f"Project {namespace}/{project} was updated") | logger.debug(f"Project {namespace}/{project} was updated") | ||||
else: | else: | ||||
msg = "New project during an incremental run: %s/%s" | msg = "New project during an incremental run: %s/%s" | ||||
logger.debug(msg, namespace, project) | logger.debug(msg, namespace, project) | ||||
try: | try: | ||||
res = self.page_request(endpoint, {}).json() | res = self.http_request(endpoint).json() | ||||
except requests.HTTPError: | except requests.HTTPError: | ||||
# The error has already been logged in `page_request` | # The error has already been logged in `http_request` | ||||
return [] | return [] | ||||
tools = res.get("tools") | tools = res.get("tools") | ||||
if tools is None: | if tools is None: | ||||
# This rarely happens, on very old URLs | # This rarely happens, on very old URLs | ||||
logger.warning("Project '%s' does not have any tools", endpoint) | logger.warning("Project '%s' does not have any tools", endpoint) | ||||
return [] | return [] | ||||
hits = [] | hits = [] | ||||
for tool in tools: | for tool in tools: | ||||
tool_name = tool["name"] | tool_name = tool["name"] | ||||
if tool_name not in VCS_NAMES: | if tool_name not in VCS_NAMES: | ||||
continue | continue | ||||
if tool_name == VcsNames.CVS.value: | if tool_name == VcsNames.CVS.value: | ||||
# CVS projects are different from other VCS ones: they use the rsync | # CVS projects are different from other VCS ones: they use the rsync | ||||
# protocol, a list of modules needs to be fetched from an info page | # protocol, a list of modules needs to be fetched from an info page | ||||
# and multiple origin URLs can be produced for the same project. | # and multiple origin URLs can be produced for the same project. | ||||
cvs_info_url = f"http://{project}.cvs.sourceforge.net" | cvs_info_url = f"http://{project}.cvs.sourceforge.net" | ||||
try: | try: | ||||
response = self.page_request(cvs_info_url, params={}) | response = self.http_request(cvs_info_url) | ||||
except requests.HTTPError: | except requests.HTTPError: | ||||
logger.warning( | logger.warning( | ||||
"CVS info page could not be fetched, skipping project '%s'", | "CVS info page could not be fetched, skipping project '%s'", | ||||
project, | project, | ||||
) | ) | ||||
continue | continue | ||||
else: | else: | ||||
bs = BeautifulSoup(response.text, features="html.parser") | bs = BeautifulSoup(response.text, features="html.parser") | ||||
Show All 23 Lines | ) -> SourceForgeListerPage: | ||||
# See https://sourceforge.net/p/forge/feature-requests/727/ | # See https://sourceforge.net/p/forge/feature-requests/727/ | ||||
url = url.replace("https://", "http://") | url = url.replace("https://", "http://") | ||||
if tool_name == VcsNames.BAZAAR.value: | if tool_name == VcsNames.BAZAAR.value: | ||||
# SourceForge has removed support for bzr and only keeps legacy projects | # SourceForge has removed support for bzr and only keeps legacy projects | ||||
# around at a separate (also not https) URL. Bzr projects are very rare | # around at a separate (also not https) URL. Bzr projects are very rare | ||||
# and a lot of them now return a 404. | # and a lot of them now return a 404. | ||||
url = f"http://{project}.bzr.sourceforge.net/bzr/{project}" | url = f"http://{project}.bzr.sourceforge.net/bzr/{project}" | ||||
try: | try: | ||||
response = self.page_request(url, params={}) | response = self.http_request(url) | ||||
if "To get this branch, use:" not in response.text: | if "To get this branch, use:" not in response.text: | ||||
# If a bzr project has multiple branches, we need to extract their | # If a bzr project has multiple branches, we need to extract their | ||||
# names from the repository landing page and create one listed origin | # names from the repository landing page and create one listed origin | ||||
# per branch | # per branch | ||||
parser = lxml.etree.HTMLParser() | parser = lxml.etree.HTMLParser() | ||||
tree = lxml.etree.fromstring(response.text, parser) | tree = lxml.etree.fromstring(response.text, parser) | ||||
# Get all tds with class 'autcell' | # Get all tds with class 'autcell' | ||||
Show All 32 Lines |