Page MenuHomeSoftware Heritage

D7879.id28433.diff
No OneTemporary

D7879.id28433.diff

diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py
--- a/swh/lister/maven/lister.py
+++ b/swh/lister/maven/lister.py
@@ -14,6 +14,7 @@
from tenacity.before_sleep import before_sleep_log
import xmltodict
+from swh.core.github.utils import GitHubSession
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -53,6 +54,7 @@
use as repository type, plus maven types for the maven loader (tgz, jar)."""
LISTER_NAME = "maven"
+ SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr")
def __init__(
self,
@@ -98,6 +100,9 @@
)
self.jar_origins: Dict[str, ListedOrigin] = {}
+ self.github_session = GitHubSession(
+ credentials=self.credentials, user_agent=USER_AGENT
+ )
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
return MavenListerState(**d)
@@ -271,35 +276,56 @@
except xmltodict.expat.ExpatError as error:
logger.info("Could not parse POM %s XML: %s. Next.", pom, error)
+    def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
+        """Build the scm origin described by a page of type "scm", if any.
+
+        The official notation is scm:{type}:{url}, for instance
+        scm:git:git://example.org/{user}/{repo}.git, but some projects publish the
+        bare repository url without the "scm:{type}:" prefix, so both forms are
+        checked to extract the type and url.
+
+        Raises
+            AssertionError when page['type'] != 'scm'
+
+        Returns
+            ListedOrigin for the detected scm (git urls are first passed to
+            github_session.get_canonical_url), None when no scm url is found.
+        """
+        assert page["type"] == "scm"
+        raw_url = page["url"]
+        url: Optional[str] = None
+        visit_type: Optional[str] = None
+
+        matched = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", raw_url)
+        if matched is not None and matched.group("type") in self.SUPPORTED_SCM_TYPES:
+            visit_type = matched.group("type")
+            url = matched.group("url")
+        elif raw_url.endswith(".git"):
+            # Also reached by "scm:"-prefixed urls whose type is not supported:
+            # the url is then kept as is, prefix included.
+            visit_type = "git"
+            url = raw_url
+
+        if url and visit_type == "git":
+            url = self.github_session.get_canonical_url(url)
+
+        # Nothing usable was detected (or the canonical url lookup gave nothing).
+        if not (url and visit_type):
+            return None
+
+        assert self.lister_obj.id is not None
+        return ListedOrigin(
+            lister_id=self.lister_obj.id,
+            url=url,
+            visit_type=visit_type,
+        )
+
def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
+
"""Convert a page of Maven repositories into a list of ListedOrigins."""
- assert self.lister_obj.id is not None
- scm_types_ok = ("git", "svn", "hg", "cvs", "bzr")
if page["type"] == "scm":
- # If origin is a scm url: detect scm type and yield.
- # Note that the official format is:
- # scm:git:git://github.com/openengsb/openengsb-framework.git
- # but many, many projects directly put the repo url, so we have to
- # detect the content to match it properly.
- m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
- if m_scm is not None:
- scm_type = m_scm.group("type")
- if scm_type in scm_types_ok:
- scm_url = m_scm.group("url")
- origin = ListedOrigin(
- lister_id=self.lister_obj.id,
- url=scm_url,
- visit_type=scm_type,
- )
- yield origin
- else:
- if page["url"].endswith(".git"):
- origin = ListedOrigin(
- lister_id=self.lister_obj.id,
- url=page["url"],
- visit_type="git",
- )
- yield origin
+ listed_origin = self.get_scm(page)
+ if listed_origin:
+ yield listed_origin
else:
# Origin is gathering source archives:
last_update_dt = None
@@ -326,6 +352,7 @@
if origin_url not in self.jar_origins:
# Create ListedOrigin instance if we did not see that origin yet
+ assert self.lister_obj.id is not None
jar_origin = ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py
--- a/swh/lister/maven/tests/test_lister.py
+++ b/swh/lister/maven/tests/test_lister.py
@@ -18,9 +18,12 @@
URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"
URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom"
+USER_REPO = "aldialimucaj/sprova4j"
+GIT_REPO_URL_HTTPS = f"https://github.com/{USER_REPO}.git"
+GIT_REPO_URL_API = f"https://api.github.com/repos/{USER_REPO}"
LIST_GIT = (
- "git://github.com/aldialimucaj/sprova4j.git",
- "https://github.com/aldialimucaj/sprova4j.git",
+ f"git://github.com/{USER_REPO}.git",
+ GIT_REPO_URL_HTTPS,
)
LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",)
@@ -86,6 +89,16 @@
return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes()
+@pytest.fixture
+def requests_mock(requests_mock):
+    """Extend the ``requests_mock`` fixture: when the github api is called for the
+    configured scm repository, answer with its canonical url.
+
+    The parent fixture of the same name is requested as a parameter. Without it,
+    ``requests_mock`` in the body would resolve to this fixture function itself,
+    which has no ``get`` attribute.
+    """
+    requests_mock.get(
+        GIT_REPO_URL_API,
+        json={"html_url": GIT_REPO_URL_HTTPS},
+    )
+    yield requests_mock
+
+
@pytest.fixture(autouse=True)
def network_requests_mock(
requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:50 PM (2 w, 22 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218026

Event Timeline