Changeset View
Standalone View
swh/lister/sourceforge/lister.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import dataclass | from dataclasses import dataclass, field | ||||
import datetime | import datetime | ||||
from enum import Enum | from enum import Enum | ||||
import logging | import logging | ||||
import re | import re | ||||
from typing import Iterator, List, Set | from typing import Any, Dict, Iterator, List, Optional, Set, Tuple | ||||
from xml.etree import ElementTree | from xml.etree import ElementTree | ||||
import iso8601 | import iso8601 | ||||
import requests | import requests | ||||
from tenacity.before_sleep import before_sleep_log | from tenacity.before_sleep import before_sleep_log | ||||
from swh.core.api.classes import stream_results | |||||
from swh.lister.utils import throttling_retry | from swh.lister.utils import throttling_retry | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | from .. import USER_AGENT | ||||
from ..pattern import StatelessLister | from ..pattern import Lister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
class VcsNames(Enum): | class VcsNames(Enum): | ||||
"""Used to filter SourceForge tool names for valid VCS types""" | """Used to filter SourceForge tool names for valid VCS types""" | ||||
# CVS projects are read-only | # CVS projects are read-only | ||||
Show All 9 Lines | |||||
@dataclass
class SourceForgeListerEntry:
    """A single listed origin candidate: one VCS tool of one project."""

    # Which VCS tool backs this origin (used as the scheduler visit type)
    vcs: VcsNames
    # Predictable clone URL built from CLONE_URL_FORMAT
    url: str
    # "Last modified" date of the enclosing project, day resolution only
    last_modified: datetime.date
# Type aliases to make the signatures below self-documenting.
SubSitemapName = str  # URL of a sub-sitemap within the main sitemap
ProjectName = str  # SourceForge project name (subprojects may contain slashes)
LastModified = datetime.date  # "last modified" dates have day resolution only
douardda: at first I thought these would be better as type annotations (instead of assignments), but I was wrong. | |||||
AlphareAuthorUnsubmitted Done Inline ActionsWhy would these "better be type annotations"? I'm not overly familiar with Python explicit typing, so I'm happy to learn. Alphare: Why would these "better be type annotations"? I'm not overly familiar with Python explicit… | |||||
@dataclass
class SourceForgeListerState:
    """Current state of the SourceForge lister in incremental runs."""

    # Mapping of sub-sitemap URL to its last known "last modified" date.
    # If the sub-sitemap is missing here, we assume a full run of it is
    # needed. If the recorded date matches the main sitemap's, we skip the
    # sub-sitemap; otherwise we request it and look up every project's
    # "last modified" date to compare against `ListedOrigins` from the
    # database.
    subsitemap_last_modified: Dict[SubSitemapName, LastModified] = field(
        default_factory=dict
    )
    # Some projects (not the majority, but still meaningful) have no VCS
    # for us to archive. Mapping of their API URL to their "last modified"
    # date, so we don't keep querying them needlessly on every run.
    empty_projects: Dict[str, LastModified] = field(default_factory=dict)
SourceForgeListerPage = List[SourceForgeListerEntry] | SourceForgeListerPage = List[SourceForgeListerEntry] | ||||
MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" | MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" | ||||
SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" | SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" | ||||
# API resource endpoint for information about the given project. | # API resource endpoint for information about the given project. | ||||
# | # | ||||
# `namespace`: Project namespace. Very often `p`, but can be something else like | # `namespace`: Project namespace. Very often `p`, but can be something else like | ||||
Show All 10 Lines | |||||
# `mount_point`: url path used by the repo. For example, the Code::Blocks project uses | # `mount_point`: url path used by the repo. For example, the Code::Blocks project uses | ||||
# `git` (https://git.code.sf.net/p/codeblocks/git). | # `git` (https://git.code.sf.net/p/codeblocks/git). | ||||
CLONE_URL_FORMAT = "{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" | CLONE_URL_FORMAT = "{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" | ||||
PROJ_URL_RE = re.compile( | PROJ_URL_RE = re.compile( | ||||
r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?" | r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?" | ||||
) | ) | ||||
# Mapping of `(namespace, project name)` to `last modified` date. | |||||
ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModified] | |||||
class SourceForgeLister(StatelessLister[SourceForgeListerPage]): | |||||
class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): | |||||
"""List origins from the "SourceForge" forge. | """List origins from the "SourceForge" forge. | ||||
""" | """ | ||||
# Part of the lister API, that identifies this lister | # Part of the lister API, that identifies this lister | ||||
LISTER_NAME = "sourceforge" | LISTER_NAME = "sourceforge" | ||||
def __init__(self, scheduler: SchedulerInterface, incremental: bool = False):
    """Initialize the lister.

    Args:
        scheduler: the scheduler this lister reports origins to
        incremental: when True, skip sub-sitemaps and projects whose
            "last modified" dates have not moved since the previous run
    """
    super().__init__(
        scheduler=scheduler, url="https://sourceforge.net", instance="main"
    )
    self.incremental = incremental
    # Holds the previously saved "last modified" dates once loaded, so we
    # can compare them against what the sitemaps report.
    self._project_last_modified: Optional[ProjectsLastModifiedCache] = None
    # Declaring the USER_AGENT is more sysadmin-friendly for the forge we list
    headers = {"Accept": "application/json", "User-Agent": USER_AGENT}
    self.session = requests.Session()
    self.session.headers.update(headers)
def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
    """Restore the lister state from its scheduler-serialized dict form.

    ISO date strings are parsed back into `datetime.date` values.
    """

    def parse_dates(mapping: Dict[str, str]) -> Dict[str, datetime.date]:
        # Both state mappings store values as ISO-formatted date strings
        return {
            key: datetime.date.fromisoformat(value) for key, value in mapping.items()
        }

    return SourceForgeListerState(
        subsitemap_last_modified=parse_dates(d.get("subsitemap_last_modified", {})),
        empty_projects=parse_dates(d.get("empty_projects", {})),
    )
def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]:
    """Serialize the lister state into a JSON-friendly dict.

    `datetime.date` values are converted to ISO date strings.
    """
    subsitemaps = {
        url: last_mod.isoformat()
        for url, last_mod in state.subsitemap_last_modified.items()
    }
    empty = {
        url: last_mod.isoformat() for url, last_mod in state.empty_projects.items()
    }
    return {"subsitemap_last_modified": subsitemaps, "empty_projects": empty}
def projects_last_modified(self) -> ProjectsLastModifiedCache:
    """Return the `(namespace, project)` -> "last modified" date mapping of
    origins previously listed by this lister, loading it lazily from the
    scheduler on first use and caching it for subsequent calls.

    Returns an empty mapping on full (non-incremental) runs.
    """
    if not self.incremental:
        # No point in loading the previous results if we're doing a full run
        return {}
    if self._project_last_modified is not None:
        return self._project_last_modified
    # Projects can have slashes in them if they're subprojects, but the
    # mount point (last component) cannot.
    url_match = re.compile(
        r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*"
    )
    # We know there will be at least that many origins
    stream = stream_results(
        self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000
    )
    cache: ProjectsLastModifiedCache = {}
    for origin in stream:
        match = url_match.match(origin.url)
        assert match is not None
        namespace = match.group("namespace")
        project = match.group("project")
        # "Last modified" dates are the same across all VCS (tools, even)
        # within a project or subproject, so any origin's date will do;
        # an assertion here would be overkill.
        assert origin.last_update is not None
        cache[(namespace, project)] = origin.last_update.date()
    self._project_last_modified = cache
    return cache
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) | @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) | ||||
def page_request(self, url, params) -> requests.Response: | def page_request(self, url, params) -> requests.Response: | ||||
# Log listed URL to ease debugging | # Log listed URL to ease debugging | ||||
logger.debug("Fetching URL %s with params %s", url, params) | logger.debug("Fetching URL %s with params %s", url, params) | ||||
response = self.session.get(url, params=params) | response = self.session.get(url, params=params) | ||||
if response.status_code != 200: | if response.status_code != 200: | ||||
Show All 20 Lines | def get_pages(self) -> Iterator[SourceForgeListerPage]: | ||||
the VCS used. Subprojects are considered separate projects. | the VCS used. Subprojects are considered separate projects. | ||||
Lastly we use the information of which VCS are used to build the predictable | Lastly we use the information of which VCS are used to build the predictable | ||||
clone URL for any given VCS. | clone URL for any given VCS. | ||||
""" | """ | ||||
sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text | sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text | ||||
tree = ElementTree.fromstring(sitemap_contents) | tree = ElementTree.fromstring(sitemap_contents) | ||||
for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): | for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): | ||||
# TODO use when adding incremental support | last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | ||||
# last_modified = sub_sitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | assert last_modified_el is not None and last_modified_el.text is not None | ||||
last_modified = datetime.date.fromisoformat(last_modified_el.text) | |||||
location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") | location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") | ||||
assert location is not None | assert location is not None and location.text is not None | ||||
sub_url = location.text | sub_url = location.text | ||||
if self.incremental: | |||||
recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url) | |||||
if recorded_last_mod == last_modified: | |||||
# The entire subsitemap hasn't changed, so none of its projects | |||||
# have either, skip it. | |||||
continue | |||||
self.state.subsitemap_last_modified[sub_url] = last_modified | |||||
subsitemap_contents = self.page_request(sub_url, {}).text | subsitemap_contents = self.page_request(sub_url, {}).text | ||||
subtree = ElementTree.fromstring(subsitemap_contents) | subtree = ElementTree.fromstring(subsitemap_contents) | ||||
yield from self._get_pages_from_subsitemap(subtree) | yield from self._get_pages_from_subsitemap(subtree) | ||||
def get_origins_from_page(
    self, page: SourceForgeListerPage
) -> Iterator[ListedOrigin]:
    """Convert one page of lister entries into scheduler `ListedOrigin`s."""
    assert self.lister_obj.id is not None
    lister_id = self.lister_obj.id
    for entry in page:
        yield ListedOrigin(
            lister_id=lister_id,
            visit_type=entry.vcs.value,
            url=entry.url,
            last_update=iso8601.parse_date(entry.last_modified),
        )
def _get_pages_from_subsitemap( | def _get_pages_from_subsitemap( | ||||
self, subtree: ElementTree.Element | self, subtree: ElementTree.Element | ||||
) -> Iterator[SourceForgeListerPage]: | ) -> Iterator[SourceForgeListerPage]: | ||||
projects: Set[str] = set() | projects: Set[ProjectName] = set() | ||||
for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): | for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): | ||||
last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") | ||||
assert last_modified_block is not None | assert last_modified_block is not None | ||||
last_modified = last_modified_block.text | last_modified = last_modified_block.text | ||||
location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") | location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") | ||||
assert location is not None | assert location is not None | ||||
project_url = location.text | project_url = location.text | ||||
assert project_url is not None | assert project_url is not None | ||||
Show All 28 Lines | ) -> Iterator[SourceForgeListerPage]: | ||||
else: | else: | ||||
# Should always match, let's log it | # Should always match, let's log it | ||||
msg = "Project URL '%s' does not match expected pattern" | msg = "Project URL '%s' does not match expected pattern" | ||||
logger.warning(msg, project_url) | logger.warning(msg, project_url) | ||||
def _get_pages_for_project(
    self, namespace, project, last_modified
) -> SourceForgeListerPage:
    """Compute the lister entries (one per relevant VCS tool) for a single
    project, querying its API endpoint.

    Returns an empty page when the project is known to be "empty" (it has
    no VCS tool we can archive) or, in incremental mode, when it has not
    changed since the previous run.
    """
    endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project)
    # An "empty" project is one without any VCS tool attached that we can
    # archive. If its "last modified" date has not moved, it is still
    # empty, so querying its API endpoint again would be pointless.
    empty_project_last_modified = self.state.empty_projects.get(endpoint)
    if empty_project_last_modified is not None:
        if last_modified == empty_project_last_modified.isoformat():
            # NOTE: lazy %-args keep formatting off the hot path and match
            # the logging style used elsewhere in this lister
            logger.debug("Project %s/%s is still empty", namespace, project)
            return []

    if self.incremental:
        expected = self.projects_last_modified().get((namespace, project))

        if expected is not None:
            if expected.isoformat() == last_modified:
                # Project has not changed since the last listing
                logger.debug("Project %s/%s has not changed", namespace, project)
                return []
            else:
                logger.debug("Project %s/%s was updated", namespace, project)
        else:
            # Either a genuinely new project, or one that gained a VCS
            # since the last listing
            msg = "New project during an incremental run: %s/%s"
            logger.debug(msg, namespace, project)

    res = self.page_request(endpoint, {}).json()

    tools = res.get("tools")
    if tools is None:
        # This probably never happens
        logger.warning("Project '%s' does not have any tools", endpoint)
        return []

    hits = []
    for tool in tools:
        tool_name = tool["name"]
        if tool_name not in VCS_NAMES:
            continue
        url = CLONE_URL_FORMAT.format(
            vcs=tool_name,
            namespace=namespace,
            project=project,
            mount_point=tool["mount_point"],
        )
        entry = SourceForgeListerEntry(
            vcs=VcsNames(tool_name), url=url, last_modified=last_modified
        )
        hits.append(entry)

    if not hits:
        # Remember this project as empty so the next incremental run can
        # skip it outright unless it changes in the meantime
        date = datetime.date.fromisoformat(last_modified)
        self.state.empty_projects[endpoint] = date
    else:
        self.state.empty_projects.pop(endpoint, None)

    return hits
at first I thought these would be better as type annotations (instead of assignments), but I was wrong. Maybe postfix them with a T (e.g. ProjectNameT ) to make it clearer these are actually type aliases?
Also, LastModified is really only a date (with no time)? (edit: looks so, according to the tests below)