Changeset View
Standalone View
swh/lister/sourceforge/lister.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import dataclass | from dataclasses import dataclass, field | ||||
import datetime | import datetime | ||||
from enum import Enum | from enum import Enum | ||||
import logging | import logging | ||||
import re | import re | ||||
from typing import Iterator, List, Set | from typing import Any, Dict, Iterator, List, Optional, Set, Tuple | ||||
from xml.etree import ElementTree | from xml.etree import ElementTree | ||||
import iso8601 | import iso8601 | ||||
import requests | import requests | ||||
from tenacity.before_sleep import before_sleep_log | from tenacity.before_sleep import before_sleep_log | ||||
from swh.core.api.classes import stream_results | |||||
from swh.lister.utils import throttling_retry | from swh.lister.utils import throttling_retry | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | from .. import USER_AGENT | ||||
from ..pattern import StatelessLister | from ..pattern import Lister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
class VcsNames(Enum): | class VcsNames(Enum): | ||||
"""Used to filter SourceForge tool names for valid VCS types""" | """Used to filter SourceForge tool names for valid VCS types""" | ||||
# CVS projects are read-only | # CVS projects are read-only | ||||
Show All 9 Lines | |||||
@dataclass
class SourceForgeListerEntry:
    """A single listed origin candidate: one VCS tool of one project."""

    # Which VCS tool backs this origin (used as the scheduler visit type)
    vcs: VcsNames
    # Predictable clone URL built from CLONE_URL_FORMAT
    url: str
    # "Last modified" date of the enclosing project, day resolution only
    last_modified: datetime.date
# Type aliases to make the signatures below self-documenting.
SubSitemapName = str  # URL of a sub-sitemap within the main sitemap
ProjectName = str  # SourceForge project name (subprojects may contain slashes)
LastModified = datetime.date  # "last modified" dates have day resolution only
douardda: at first I thought these would be better as type annotations (instead of assignments), but I was wrong. | |||||
AlphareAuthorUnsubmitted Done Inline ActionsWhy would these "better be type annotations"? I'm not overly familiar with Python explicit typing, so I'm happy to learn. Alphare: Why would these "better be type annotations"? I'm not overly familiar with Python explicit… | |||||
@dataclass
class SourceForgeListerState:
    """Current state of the SourceForge lister in incremental runs."""

    # Mapping of sub-sitemap URL to its last known "last modified" date.
    # If the sub-sitemap is missing here, we assume a full run of it is
    # needed. If the recorded date matches the main sitemap's, we skip the
    # sub-sitemap; otherwise we request it and look up every project's
    # "last modified" date to compare against `ListedOrigins` from the
    # database.
    subsitemap_last_modified: Dict[SubSitemapName, LastModified] = field(
        default_factory=dict
    )
    # Some projects (not the majority, but still meaningful) have no VCS
    # for us to archive. Mapping of their API URL to their "last modified"
    # date, so we don't keep querying them needlessly on every run.
    empty_projects: Dict[str, LastModified] = field(default_factory=dict)
SourceForgeListerPage = List[SourceForgeListerEntry] | SourceForgeListerPage = List[SourceForgeListerEntry] | ||||
MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" | MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" | ||||
SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" | SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" | ||||
# API resource endpoint for information about the given project. | # API resource endpoint for information about the given project. | ||||
# | # | ||||
# `namespace`: Project namespace. Very often `p`, but can be something else like | # `namespace`: Project namespace. Very often `p`, but can be something else like | ||||
Show All 10 Lines | |||||
# `mount_point`: url path used by the repo. For example, the Code::Blocks project uses | # `mount_point`: url path used by the repo. For example, the Code::Blocks project uses | ||||
# `git` (https://git.code.sf.net/p/codeblocks/git). | # `git` (https://git.code.sf.net/p/codeblocks/git). | ||||
CLONE_URL_FORMAT = "{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" | CLONE_URL_FORMAT = "{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" | ||||
PROJ_URL_RE = re.compile( | PROJ_URL_RE = re.compile( | ||||
r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?" | r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?" | ||||
) | ) | ||||
# Mapping of `(namespace, project name)` to `last modified` date. | |||||
ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModified] | |||||
class SourceForgeLister(StatelessLister[SourceForgeListerPage]): | |||||
class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): | |||||
"""List origins from the "SourceForge" forge. | """List origins from the "SourceForge" forge. | ||||
""" | """ | ||||
# Part of the lister API, that identifies this lister | # Part of the lister API, that identifies this lister | ||||
LISTER_NAME = "sourceforge" | LISTER_NAME = "sourceforge" | ||||
def __init__(self, scheduler: SchedulerInterface, incremental: bool = False):
    """Initialize the lister.

    Args:
        scheduler: the scheduler this lister reports origins to
        incremental: when True, skip sub-sitemaps and projects whose
            "last modified" dates have not moved since the previous run
    """
    super().__init__(
        scheduler=scheduler, url="https://sourceforge.net", instance="main"
    )
    self.incremental = incremental
    # Holds the previously saved "last modified" dates once loaded, so we
    # can compare them against what the sitemaps report.
    self._project_last_modified: Optional[ProjectsLastModifiedCache] = None
    # Declaring the USER_AGENT is more sysadmin-friendly for the forge we list
    headers = {"Accept": "application/json", "User-Agent": USER_AGENT}
    self.session = requests.Session()
    self.session.headers.update(headers)
def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
    """Restore the lister state from its scheduler-serialized dict form.

    ISO date strings are parsed back into `datetime.date` values.
    """

    def parse_dates(mapping: Dict[str, str]) -> Dict[str, datetime.date]:
        # Both state mappings store values as ISO-formatted date strings
        return {
            key: datetime.date.fromisoformat(value) for key, value in mapping.items()
        }

    return SourceForgeListerState(
        subsitemap_last_modified=parse_dates(d.get("subsitemap_last_modified", {})),
        empty_projects=parse_dates(d.get("empty_projects", {})),
    )
def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]:
    """Serialize the lister state into a JSON-friendly dict.

    `datetime.date` values are converted to ISO date strings.
    """
    subsitemaps = {
        url: last_mod.isoformat()
        for url, last_mod in state.subsitemap_last_modified.items()
    }
    empty = {
        url: last_mod.isoformat() for url, last_mod in state.empty_projects.items()
    }
    return {"subsitemap_last_modified": subsitemaps, "empty_projects": empty}
def projects_last_modified(self) -> ProjectsLastModifiedCache:
    """Return the `(namespace, project)` -> "last modified" date mapping of
    origins previously listed by this lister, loading it lazily from the
    scheduler on first use and caching it for subsequent calls.

    Returns an empty mapping on full (non-incremental) runs.
    """
    if not self.incremental:
        # No point in loading the previous results if we're doing a full run
        return {}
    if self._project_last_modified is not None:
        return self._project_last_modified
    # Projects can have slashes in them if they're subprojects, but the
    # mount point (last component) cannot.
    url_match = re.compile(
        r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*"
    )
    # We know there will be at least that many origins
    stream = stream_results(
        self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000
    )
    cache: ProjectsLastModifiedCache = {}
    for origin in stream:
        match = url_match.match(origin.url)
        assert match is not None
        namespace = match.group("namespace")
        project = match.group("project")
        # "Last modified" dates are the same across all VCS (tools, even)
        # within a project or subproject, so any origin's date will do;
        # an assertion here would be overkill.
        assert origin.last_update is not None
        cache[(namespace, project)] = origin.last_update.date()
    self._project_last_modified = cache
    return cache
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) | @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) | ||||
def page_request(self, url, params) -> requests.Response: | def page_request(self, url, params) -> requests.Response: | ||||
# Log listed URL to ease debugging | # Log listed URL to ease debugging | ||||
logger.debug("Fetching URL %s with params %s", url, params) | logger.debug("Fetching URL %s with params %s", url, params) | ||||
response = self.session.get(url, params=params) | response = self.session.get(url, params=params) | ||||
if response.status_code != 200: | if response.status_code != 200: | ||||
Show All 20 Lines | def get_pages(self) -> Iterator[SourceForgeListerPage]: | ||||
the VCS used. Subprojects are considered separate projects. | the VCS used. Subprojects are considered separate projects. | ||||
Lastly we use the information of which VCS are used to build the predictable | Lastly we use the information of which VCS are used to build the predictable | ||||
clone URL for any given VCS. | clone URL for any given VCS. | ||||
""" | """ | ||||
sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text | sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text | ||||
tree = ElementTree.fromstring(sitemap_contents) | tree = ElementTree.fromstring(sitemap_contents) | ||||
for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): | for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): | ||||
# TODO use when adding incremental support | last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | ||||
# last_modified = sub_sitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | assert last_modified_el is not None and last_modified_el.text is not None | ||||
last_modified = datetime.date.fromisoformat(last_modified_el.text) | |||||
location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") | location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") | ||||
assert location is not None | assert location is not None and location.text is not None | ||||
sub_url = location.text | sub_url = location.text | ||||
if self.incremental: | |||||
recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url) | |||||
if recorded_last_mod == last_modified: | |||||
# The entire subsitemap hasn't changed, so none of its projects | |||||
# have either, skip it. | |||||
continue | |||||
self.state.subsitemap_last_modified[sub_url] = last_modified | |||||
subsitemap_contents = self.page_request(sub_url, {}).text | subsitemap_contents = self.page_request(sub_url, {}).text | ||||
subtree = ElementTree.fromstring(subsitemap_contents) | subtree = ElementTree.fromstring(subsitemap_contents) | ||||
yield from self._get_pages_from_subsitemap(subtree) | yield from self._get_pages_from_subsitemap(subtree) | ||||
def get_origins_from_page(
    self, page: SourceForgeListerPage
) -> Iterator[ListedOrigin]:
    """Convert one page of lister entries into scheduler `ListedOrigin`s."""
    assert self.lister_obj.id is not None
    lister_id = self.lister_obj.id
    for entry in page:
        yield ListedOrigin(
            lister_id=lister_id,
            visit_type=entry.vcs.value,
            url=entry.url,
            last_update=iso8601.parse_date(entry.last_modified),
        )
def _get_pages_from_subsitemap( | def _get_pages_from_subsitemap( | ||||
self, subtree: ElementTree.Element | self, subtree: ElementTree.Element | ||||
) -> Iterator[SourceForgeListerPage]: | ) -> Iterator[SourceForgeListerPage]: | ||||
projects: Set[str] = set() | projects: Set[ProjectName] = set() | ||||
for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): | for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): | ||||
last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | ||||
assert last_modified_block is not None | assert last_modified_block is not None | ||||
last_modified = last_modified_block.text | last_modified = last_modified_block.text | ||||
location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") | location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") | ||||
assert location is not None | assert location is not None | ||||
project_url = location.text | project_url = location.text | ||||
assert project_url is not None | assert project_url is not None | ||||
Show All 28 Lines | ) -> Iterator[SourceForgeListerPage]: | ||||
else: | else: | ||||
# Should always match, let's log it | # Should always match, let's log it | ||||
msg = "Project URL '%s' does not match expected pattern" | msg = "Project URL '%s' does not match expected pattern" | ||||
logger.warning(msg, project_url) | logger.warning(msg, project_url) | ||||
def _get_pages_for_project(
    self, namespace, project, last_modified
) -> SourceForgeListerPage:
    """Compute the lister entries (one per relevant VCS tool) for a single
    project, querying its API endpoint.

    Returns an empty page when the project is known to be "empty" (it has
    no VCS tool we can archive) or, in incremental mode, when it has not
    changed since the previous run.
    """
    endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project)
    # An "empty" project is one without any VCS tool attached that we can
    # archive. If its "last modified" date has not moved, it is still
    # empty, so querying its API endpoint again would be pointless.
    empty_project_last_modified = self.state.empty_projects.get(endpoint)
    if empty_project_last_modified is not None:
        if last_modified == empty_project_last_modified.isoformat():
            # NOTE: lazy %-args keep formatting off the hot path and match
            # the logging style used elsewhere in this lister
            logger.debug("Project %s/%s is still empty", namespace, project)
            return []

    if self.incremental:
        expected = self.projects_last_modified().get((namespace, project))

        if expected is not None:
            if expected.isoformat() == last_modified:
                # Project has not changed since the last listing
                logger.debug("Project %s/%s has not changed", namespace, project)
                return []
            else:
                logger.debug("Project %s/%s was updated", namespace, project)
        else:
            # Either a genuinely new project, or one that gained a VCS
            # since the last listing
            msg = "New project during an incremental run: %s/%s"
            logger.debug(msg, namespace, project)

    res = self.page_request(endpoint, {}).json()

    tools = res.get("tools")
    if tools is None:
        # This probably never happens
        logger.warning("Project '%s' does not have any tools", endpoint)
        return []

    hits = []
    for tool in tools:
        tool_name = tool["name"]
        if tool_name not in VCS_NAMES:
            continue
        url = CLONE_URL_FORMAT.format(
            vcs=tool_name,
            namespace=namespace,
            project=project,
            mount_point=tool["mount_point"],
        )
        entry = SourceForgeListerEntry(
            vcs=VcsNames(tool_name), url=url, last_modified=last_modified
        )
        hits.append(entry)

    if not hits:
        # Remember this project as empty so the next incremental run can
        # skip it outright unless it changes in the meantime
        date = datetime.date.fromisoformat(last_modified)
        self.state.empty_projects[endpoint] = date
    else:
        self.state.empty_projects.pop(endpoint, None)

    return hits
at first I thought these would be better as type annotations (instead of assignments), but I was wrong. Maybe postfix them with a T (e.g. ProjectNameT ) to make it clearer these are actually type aliases?
Also, LastModified is really only a date (with no time)? (edit: looks so, according to the tests below)