Differential D8520 Diff 30740 swh/lister/sourceforge/lister.py

Changeset View

Standalone View

swh/lister/sourceforge/lister.py

Show All 9 Lines
import re		import re
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple		from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from xml.etree import ElementTree		from xml.etree import ElementTree

from bs4 import BeautifulSoup		from bs4 import BeautifulSoup
import iso8601		import iso8601
import lxml		import lxml
import requests		import requests
from tenacity.before_sleep import before_sleep_log

from swh.core.api.classes import stream_results		from swh.core.api.classes import stream_results
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface		from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin		from swh.scheduler.model import ListedOrigin

from .. import USER_AGENT
from ..pattern import CredentialsType, Lister		from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)		logger = logging.getLogger(__name__)


class VcsNames(Enum):		class VcsNames(Enum):
"""Used to filter SourceForge tool names for valid VCS types"""		"""Used to filter SourceForge tool names for valid VCS types"""

▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	):
url="https://sourceforge.net",		url="https://sourceforge.net",
instance="main",		instance="main",
credentials=credentials,		credentials=credentials,
)		)

# Will hold the currently saved "last modified" dates to compare against our		# Will hold the currently saved "last modified" dates to compare against our
# requests.		# requests.
self._project_last_modified: Optional[ProjectsLastModifiedCache] = None		self._project_last_modified: Optional[ProjectsLastModifiedCache] = None
self.session = requests.Session()
# Declare the USER_AGENT is more sysadm-friendly for the forge we list		self.session.headers.update({"Accept": "application/json"})
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.incremental = incremental		self.incremental = incremental

def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:		def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
subsitemaps = {		subsitemaps = {
k: datetime.date.fromisoformat(v)		k: datetime.date.fromisoformat(v)
for k, v in d.get("subsitemap_last_modified", {}).items()		for k, v in d.get("subsitemap_last_modified", {}).items()
}		}
empty_projects = {		empty_projects = {
▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines	def projects_last_modified(self) -> ProjectsLastModifiedCache:
# within a project or subproject. An assertion here would be overkill.		# within a project or subproject. An assertion here would be overkill.
last_modified = origin.last_update		last_modified = origin.last_update
assert last_modified is not None		assert last_modified is not None
listed_origins[(namespace, project)] = last_modified.date()		listed_origins[(namespace, project)] = last_modified.date()

self._project_last_modified = listed_origins		self._project_last_modified = listed_origins
return listed_origins		return listed_origins

@http_retry(
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def page_request(self, url, params) -> requests.Response:
    """Issue a GET request on the lister session and return the response.

    Args:
        url: address to fetch.
        params: query parameters forwarded to ``session.get``.

    Returns:
        The ``requests.Response`` obtained for ``url``.

    Raises:
        requests.HTTPError: when ``raise_for_status()`` rejects the
            response status (HTTP error codes).

    The call is wrapped by ``http_retry``, which is handed a ``before_sleep``
    hook that logs at WARNING level (presumably between retry attempts;
    see ``swh.lister.utils.http_retry`` for the exact retry policy).
    """
    # Trace every fetched URL to ease debugging
    logger.debug("Fetching URL %s with params %s", url, params)
    resp = self.session.get(url, params=params)

    # Nominal case: nothing to report, hand the response back right away
    if resp.status_code == 200:
        return resp

    # Record the unexpected status to ease debugging
    logger.warning(
        "Unexpected HTTP status code %s for URL %s",
        resp.status_code,
        resp.url,
    )
    # The lister must fail on blocking errors; statuses that are not
    # errors fall through and are returned as-is
    resp.raise_for_status()
    return resp

def get_pages(self) -> Iterator[SourceForgeListerPage]:		def get_pages(self) -> Iterator[SourceForgeListerPage]:
"""		"""
SourceForge has a main XML sitemap that lists its sharded sitemaps for all		SourceForge has a main XML sitemap that lists its sharded sitemaps for all
projects.		projects.
Each XML sub-sitemap lists project pages, which are not unique per project: a		Each XML sub-sitemap lists project pages, which are not unique per project: a
project can have a wiki, a home, a git, an svn, etc.		project can have a wiki, a home, a git, an svn, etc.
For each unique project, we query an API endpoint that lists (among		For each unique project, we query an API endpoint that lists (among
other things) the tools associated with said project, some of which are		other things) the tools associated with said project, some of which are
the VCS used. Subprojects are considered separate projects.		the VCS used. Subprojects are considered separate projects.
Lastly we use the information of which VCS are used to build the predictable		Lastly we use the information of which VCS are used to build the predictable
clone URL for any given VCS.		clone URL for any given VCS.
"""		"""
sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text		sitemap_contents = self.http_request(MAIN_SITEMAP_URL).text
tree = ElementTree.fromstring(sitemap_contents)		tree = ElementTree.fromstring(sitemap_contents)

for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):		for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):
last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod")		last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
assert last_modified_el is not None and last_modified_el.text is not None		assert last_modified_el is not None and last_modified_el.text is not None
last_modified = datetime.date.fromisoformat(last_modified_el.text)		last_modified = datetime.date.fromisoformat(last_modified_el.text)
location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc")		location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc")
assert location is not None and location.text is not None		assert location is not None and location.text is not None
sub_url = location.text		sub_url = location.text

if self.incremental:		if self.incremental:
recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url)		recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url)
if recorded_last_mod == last_modified:		if recorded_last_mod == last_modified:
# The entire subsitemap hasn't changed, so none of its projects		# The entire subsitemap hasn't changed, so none of its projects
# have either, skip it.		# have either, skip it.
continue		continue

self.state.subsitemap_last_modified[sub_url] = last_modified		self.state.subsitemap_last_modified[sub_url] = last_modified
subsitemap_contents = self.page_request(sub_url, {}).text		subsitemap_contents = self.http_request(sub_url).text
subtree = ElementTree.fromstring(subsitemap_contents)		subtree = ElementTree.fromstring(subsitemap_contents)

yield from self._get_pages_from_subsitemap(subtree)		yield from self._get_pages_from_subsitemap(subtree)

def get_origins_from_page(		def get_origins_from_page(
self, page: SourceForgeListerPage		self, page: SourceForgeListerPage
) -> Iterator[ListedOrigin]:		) -> Iterator[ListedOrigin]:
assert self.lister_obj.id is not None		assert self.lister_obj.id is not None
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	) -> SourceForgeListerPage:
return []		return []
else:		else:
logger.debug(f"Project {namespace}/{project} was updated")		logger.debug(f"Project {namespace}/{project} was updated")
else:		else:
msg = "New project during an incremental run: %s/%s"		msg = "New project during an incremental run: %s/%s"
logger.debug(msg, namespace, project)		logger.debug(msg, namespace, project)

try:		try:
res = self.page_request(endpoint, {}).json()		res = self.http_request(endpoint).json()
except requests.HTTPError:		except requests.HTTPError:
# We've already logged in `page_request`		# We've already logged in `http_request`
return []		return []

tools = res.get("tools")		tools = res.get("tools")
if tools is None:		if tools is None:
# This rarely happens, on very old URLs		# This rarely happens, on very old URLs
logger.warning("Project '%s' does not have any tools", endpoint)		logger.warning("Project '%s' does not have any tools", endpoint)
return []		return []

hits = []		hits = []
for tool in tools:		for tool in tools:
tool_name = tool["name"]		tool_name = tool["name"]
if tool_name not in VCS_NAMES:		if tool_name not in VCS_NAMES:
continue		continue
if tool_name == VcsNames.CVS.value:		if tool_name == VcsNames.CVS.value:
# CVS projects are different from other VCS ones, they use the rsync		# CVS projects are different from other VCS ones, they use the rsync
# protocol, a list of modules needs to be fetched from an info page		# protocol, a list of modules needs to be fetched from an info page
# and multiple origin URLs can be produced for a same project.		# and multiple origin URLs can be produced for a same project.
cvs_info_url = f"http://{project}.cvs.sourceforge.net"		cvs_info_url = f"http://{project}.cvs.sourceforge.net"
try:		try:
response = self.page_request(cvs_info_url, params={})		response = self.http_request(cvs_info_url)
except requests.HTTPError:		except requests.HTTPError:
logger.warning(		logger.warning(
"CVS info page could not be fetched, skipping project '%s'",		"CVS info page could not be fetched, skipping project '%s'",
project,		project,
)		)
continue		continue
else:		else:
bs = BeautifulSoup(response.text, features="html.parser")		bs = BeautifulSoup(response.text, features="html.parser")
Show All 23 Lines	) -> SourceForgeListerPage:
# See https://sourceforge.net/p/forge/feature-requests/727/		# See https://sourceforge.net/p/forge/feature-requests/727/
url = url.replace("https://", "http://")		url = url.replace("https://", "http://")
if tool_name == VcsNames.BAZAAR.value:		if tool_name == VcsNames.BAZAAR.value:
# SourceForge has removed support for bzr and only keeps legacy projects		# SourceForge has removed support for bzr and only keeps legacy projects
# around at a separate (also not https) URL. Bzr projects are very rare		# around at a separate (also not https) URL. Bzr projects are very rare
# and a lot of them are 404 now.		# and a lot of them are 404 now.
url = f"http://{project}.bzr.sourceforge.net/bzr/{project}"		url = f"http://{project}.bzr.sourceforge.net/bzr/{project}"
try:		try:
response = self.page_request(url, params={})		response = self.http_request(url)
if "To get this branch, use:" not in response.text:		if "To get this branch, use:" not in response.text:
# If a bzr project has multiple branches, we need to extract their		# If a bzr project has multiple branches, we need to extract their
# names from the repository landing page and create one listed origin		# names from the repository landing page and create one listed origin
# per branch		# per branch
parser = lxml.etree.HTMLParser()		parser = lxml.etree.HTMLParser()
tree = lxml.etree.fromstring(response.text, parser)		tree = lxml.etree.fromstring(response.text, parser)

# Get all tds with class 'autcell'		# Get all tds with class 'autcell'
Show All 32 Lines