Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9346251
D7879.id28433.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Subscribers
None
D7879.id28433.diff
View Options
diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py
--- a/swh/lister/maven/lister.py
+++ b/swh/lister/maven/lister.py
@@ -14,6 +14,7 @@
from tenacity.before_sleep import before_sleep_log
import xmltodict
+from swh.core.github.utils import GitHubSession
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -53,6 +54,7 @@
use as repository type, plus maven types for the maven loader (tgz, jar)."""
LISTER_NAME = "maven"
+ SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr")
def __init__(
self,
@@ -98,6 +100,9 @@
)
self.jar_origins: Dict[str, ListedOrigin] = {}
+ self.github_session = GitHubSession(
+ credentials=self.credentials, user_agent=USER_AGENT
+ )
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
return MavenListerState(**d)
@@ -271,35 +276,56 @@
except xmltodict.expat.ExpatError as error:
logger.info("Could not parse POM %s XML: %s. Next.", pom, error)
+ def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
+ """Retrieve scm origin out of the page information. Only called when type of the
+ page is scm.
+
+ Try and detect an scm. Note that official format is of the form:
+ scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put
+ the repo url (without the "scm:type"), so we have to check against the content
+ to extract the type, url properly.
+
+ Raises
+ AssertionError when page['type'] != 'scm'
+
+ Returns
+ ListedOrigin with proper canonical scm url (for github) if any is found,
+ None otherwise.
+ """
+
+ assert page["type"] == "scm"
+ visit_type: Optional[str] = None
+ url: Optional[str] = None
+ origin: Optional[ListedOrigin] = None
+ m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
+
+ scm_type = m_scm.group("type") if m_scm is not None else None
+ if m_scm and scm_type and scm_type in self.SUPPORTED_SCM_TYPES:
+ url = m_scm.group("url")
+ visit_type = scm_type
+ elif page["url"].endswith(".git"):
+ url = page["url"]
+ visit_type = "git"
+
+ if url and visit_type and visit_type == "git":
+ url = self.github_session.get_canonical_url(url)
+
+ if url and visit_type:
+ assert self.lister_obj.id is not None
+ origin = ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=url,
+ visit_type=visit_type,
+ )
+ return origin
+
def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
+
"""Convert a page of Maven repositories into a list of ListedOrigins."""
- assert self.lister_obj.id is not None
- scm_types_ok = ("git", "svn", "hg", "cvs", "bzr")
if page["type"] == "scm":
- # If origin is a scm url: detect scm type and yield.
- # Note that the official format is:
- # scm:git:git://github.com/openengsb/openengsb-framework.git
- # but many, many projects directly put the repo url, so we have to
- # detect the content to match it properly.
- m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
- if m_scm is not None:
- scm_type = m_scm.group("type")
- if scm_type in scm_types_ok:
- scm_url = m_scm.group("url")
- origin = ListedOrigin(
- lister_id=self.lister_obj.id,
- url=scm_url,
- visit_type=scm_type,
- )
- yield origin
- else:
- if page["url"].endswith(".git"):
- origin = ListedOrigin(
- lister_id=self.lister_obj.id,
- url=page["url"],
- visit_type="git",
- )
- yield origin
+ listed_origin = self.get_scm(page)
+ if listed_origin:
+ yield listed_origin
else:
# Origin is gathering source archives:
last_update_dt = None
@@ -326,6 +352,7 @@
if origin_url not in self.jar_origins:
# Create ListedOrigin instance if we did not see that origin yet
+ assert self.lister_obj.id is not None
jar_origin = ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py
--- a/swh/lister/maven/tests/test_lister.py
+++ b/swh/lister/maven/tests/test_lister.py
@@ -18,9 +18,12 @@
URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"
URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom"
+USER_REPO = "aldialimucaj/sprova4j"
+GIT_REPO_URL_HTTPS = f"https://github.com/{USER_REPO}.git"
+GIT_REPO_URL_API = f"https://api.github.com/repos/{USER_REPO}"
LIST_GIT = (
- "git://github.com/aldialimucaj/sprova4j.git",
- "https://github.com/aldialimucaj/sprova4j.git",
+ f"git://github.com/{USER_REPO}.git",
+ GIT_REPO_URL_HTTPS,
)
LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",)
@@ -86,6 +89,16 @@
return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes()
+@pytest.fixture
+def requests_mock(requests_mock):
+    """Serve the GitHub API lookup of the scm repository with its canonical url.
+
+    Takes the plugin's ``requests_mock`` fixture as argument to extend it in place.
+    """
+    requests_mock.get(GIT_REPO_URL_API, json={"html_url": GIT_REPO_URL_HTTPS})
+    yield requests_mock
+
+
@pytest.fixture(autouse=True)
def network_requests_mock(
requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 3:50 PM (2 w, 22 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218026
Attached To
D7879: Adapt maven lister to list canonical gh urls if any
Event Timeline
Log In to Comment