diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,2 +1,2 @@
-swh.core[db,github] >= 2.6
+swh.core[db,github] >= 2.8
 swh.scheduler >= 0.8
diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py
--- a/swh/lister/maven/lister.py
+++ b/swh/lister/maven/lister.py
@@ -14,6 +14,7 @@
 from tenacity.before_sleep import before_sleep_log
 import xmltodict
 
+from swh.core.github.utils import GitHubSession
 from swh.lister.utils import throttling_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@@ -53,6 +54,7 @@
     use as repository type, plus maven types for the maven loader (tgz, jar)."""
 
     LISTER_NAME = "maven"
+    SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr")
 
     def __init__(
         self,
@@ -98,6 +100,9 @@
         )
 
         self.jar_origins: Dict[str, ListedOrigin] = {}
+        self.github_session = GitHubSession(
+            credentials=self.credentials, user_agent=USER_AGENT
+        )
 
     def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
         return MavenListerState(**d)
@@ -271,35 +276,56 @@
             except xmltodict.expat.ExpatError as error:
                 logger.info("Could not parse POM %s XML: %s. Next.", pom, error)
 
+    def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
+        """Retrieve scm origin out of the page information. Only called when type of the
+        page is scm.
+
+        Try and detect an scm. Note that official format is of the form:
+        scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put
+        the repo url (without the "scm:type"), so we have to check against the content
+        to extract the type, url properly.
+
+        Raises
+            AssertionError when page['type'] != 'scm'
+
+        Returns
+            ListedOrigin with proper canonical scm url (for github) if any is found,
+            None otherwise.
+        """
+
+        assert page["type"] == "scm"
+        visit_type: Optional[str] = None
+        url: Optional[str] = None
+        origin: Optional[ListedOrigin] = None
+        m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
+
+        scm_type = m_scm.group("type") if m_scm is not None else None
+        if m_scm and scm_type and scm_type in self.SUPPORTED_SCM_TYPES:
+            url = m_scm.group("url")
+            visit_type = scm_type
+        elif page["url"].endswith(".git"):
+            # str.lstrip() strips a set of characters, not a prefix; drop only "scm:".
+            url = re.sub(r"^scm:", "", page["url"])
+            visit_type = "git"
+
+        if url and visit_type == "git":
+            url = self.github_session.get_canonical_url(url)
+
+        if url and visit_type:
+            assert self.lister_obj.id is not None
+            origin = ListedOrigin(
+                lister_id=self.lister_obj.id,
+                url=url,
+                visit_type=visit_type,
+            )
+        return origin
+
     def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
         """Convert a page of Maven repositories into a list of ListedOrigins."""
-        assert self.lister_obj.id is not None
-        scm_types_ok = ("git", "svn", "hg", "cvs", "bzr")
         if page["type"] == "scm":
-            # If origin is a scm url: detect scm type and yield.
-            # Note that the official format is:
-            # scm:git:git://github.com/openengsb/openengsb-framework.git
-            # but many, many projects directly put the repo url, so we have to
-            # detect the content to match it properly.
-            m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
-            if m_scm is not None:
-                scm_type = m_scm.group("type")
-                if scm_type in scm_types_ok:
-                    scm_url = m_scm.group("url")
-                    origin = ListedOrigin(
-                        lister_id=self.lister_obj.id,
-                        url=scm_url,
-                        visit_type=scm_type,
-                    )
-                    yield origin
-            else:
-                if page["url"].endswith(".git"):
-                    origin = ListedOrigin(
-                        lister_id=self.lister_obj.id,
-                        url=page["url"],
-                        visit_type="git",
-                    )
-                    yield origin
+            listed_origin = self.get_scm(page)
+            if listed_origin:
+                yield listed_origin
         else:
             # Origin is gathering source archives:
             last_update_dt = None
@@ -326,6 +352,7 @@
 
             if origin_url not in self.jar_origins:
                 # Create ListedOrigin instance if we did not see that origin yet
+                assert self.lister_obj.id is not None
                 jar_origin = ListedOrigin(
                     lister_id=self.lister_obj.id,
                     url=origin_url,
diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py
--- a/swh/lister/maven/tests/test_lister.py
+++ b/swh/lister/maven/tests/test_lister.py
@@ -18,12 +18,17 @@
 URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"
 URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom"
 
-LIST_GIT = (
-    "git://github.com/aldialimucaj/sprova4j.git",
-    "https://github.com/aldialimucaj/sprova4j.git",
-)
 
-LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",)
+USER_REPO0 = "aldialimucaj/sprova4j"
+GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}"
+GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}"
+LIST_GIT = (GIT_REPO_URL0_HTTPS,)
+
+USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java"
+GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}"
+GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git"
+GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}"
+LIST_GIT_INCR = (GIT_REPO_URL1_HTTPS,)
 
 LIST_SRC = (MVN_URL + "al/aldi/sprova4j",)
 
@@ -86,6 +91,20 @@
     return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes()
 
 
+@pytest.fixture
+def requests_mock(requests_mock):
+    """If github api calls for the configured scm repository, returns its canonical url."""
+    for url_api, url_html in [
+        (GIT_REPO_URL0_API, GIT_REPO_URL0_HTTPS),
+        (GIT_REPO_URL1_API, GIT_REPO_URL1_HTTPS),
+    ]:
+        requests_mock.get(
+            url_api,
+            json={"html_url": url_html},
+        )
+    yield requests_mock
+
+
 @pytest.fixture(autouse=True)
 def network_requests_mock(
     requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3
@@ -118,7 +137,7 @@
     origin_urls = [origin.url for origin in scheduler_origins]
 
-    # 3 git origins + 1 maven origin with 2 releases (one per jar)
-    assert len(origin_urls) == 4
+    # 2 git origins + 1 maven origin with 2 releases (one per jar)
+    assert len(origin_urls) == 3
     assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC)
 
     for origin in scheduler_origins:
@@ -164,7 +183,7 @@
 
     # 2 git origins + 1 maven origin with 2 releases (one per jar)
     assert len(origin_urls) == 3
-    assert sorted(origin_urls) == sorted((LIST_GIT[1],) + LIST_GIT_INCR + LIST_SRC)
+    assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC)
 
     for origin in scheduler_origins:
         if origin.visit_type == "maven":
@@ -212,7 +231,7 @@
 
     # 1 git origins + 1 maven origin with 1 release (one per jar)
     assert len(origin_urls) == 2
-    assert sorted(origin_urls) == sorted((LIST_GIT[0],) + LIST_SRC)
+    assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC)
 
     for origin in scheduler_origins:
         if origin.visit_type == "maven":