diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -14,6 +14,7 @@ from tenacity.before_sleep import before_sleep_log import xmltodict +from swh.core.github.utils import GitHubSession from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -53,6 +54,7 @@ use as repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" + SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr") def __init__( self, @@ -98,6 +100,9 @@ ) self.jar_origins: Dict[str, ListedOrigin] = {} + self.github_session = GitHubSession( + credentials=self.credentials, user_agent=USER_AGENT + ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) @@ -271,35 +276,56 @@ except xmltodict.expat.ExpatError as error: logger.info("Could not parse POM %s XML: %s. Next.", pom, error) + def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: + """Retrieve scm origin out of the page information. Only called when type of the + page is scm. + + Try and detect an scm. Note that official format is of the form: + scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put + the repo url (without the "scm:type"), so we have to check against the content + to extract the type, url properly. + + Raises + AssertionError when page['type'] != 'scm' + + Returns + ListedOrigin with proper canonical scm url (for github) if any is found, + None otherwise. 
+ """ + + assert page["type"] == "scm" + visit_type: Optional[str] = None + url: Optional[str] = None + origin: Optional[ListedOrigin] = None + m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"]) + + scm_type = m_scm.group("type") if m_scm is not None else None + if m_scm and scm_type and scm_type in self.SUPPORTED_SCM_TYPES: + url = m_scm.group("url") + visit_type = scm_type + elif page["url"].endswith(".git"): + url = page["url"] + visit_type = "git" + + if url and visit_type and visit_type == "git": + url = self.github_session.get_canonical_url(url) + + if url and visit_type: + assert self.lister_obj.id is not None + origin = ListedOrigin( + lister_id=self.lister_obj.id, + url=url, + visit_type=visit_type, + ) + return origin + def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: + """Convert a page of Maven repositories into a list of ListedOrigins.""" - assert self.lister_obj.id is not None - scm_types_ok = ("git", "svn", "hg", "cvs", "bzr") if page["type"] == "scm": - # If origin is a scm url: detect scm type and yield. - # Note that the official format is: - # scm:git:git://github.com/openengsb/openengsb-framework.git - # but many, many projects directly put the repo url, so we have to - # detect the content to match it properly. 
- m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"]) - if m_scm is not None: - scm_type = m_scm.group("type") - if scm_type in scm_types_ok: - scm_url = m_scm.group("url") - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=scm_url, - visit_type=scm_type, - ) - yield origin - else: - if page["url"].endswith(".git"): - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=page["url"], - visit_type="git", - ) - yield origin + listed_origin = self.get_scm(page) + if listed_origin: + yield listed_origin else: # Origin is gathering source archives: last_update_dt = None @@ -326,6 +352,7 @@ if origin_url not in self.jar_origins: # Create ListedOrigin instance if we did not see that origin yet + assert self.lister_obj.id is not None jar_origin = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url,