Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/nuget/lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||||||
from dataclasses import dataclass | |||||||||||||
from datetime import datetime | |||||||||||||
import logging | import logging | ||||||||||||
from typing import Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||||||||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||||||||||
import iso8601 | |||||||||||||
from requests.exceptions import HTTPError | from requests.exceptions import HTTPError | ||||||||||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||||||||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||||||||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, Lister | ||||||||||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||||||||
# Aliasing the page results returned by `get_pages` method from the lister. | # Aliasing the page results returned by `get_pages` method from the lister. | ||||||||||||
NugetListerPage = List[Dict[str, str]] | NugetListerPage = List[Dict[str, str]] | ||||||||||||
@dataclass
class NugetListerState:
    """Lister state kept between runs so that listing can be incremental."""

    last_listing_date: Optional[datetime] = None
    """Date reported by the main HTTP API endpoint the last time the lister
    ran; ``None`` when no previous run has been recorded."""
class NugetLister(Lister[NugetListerState, NugetListerPage]): | |||||||||||||
"""List Nuget (Package manager for .NET) origins.""" | """List Nuget (Package manager for .NET) origins.""" | ||||||||||||
LISTER_NAME = "nuget" | LISTER_NAME = "nuget" | ||||||||||||
INSTANCE = "nuget" | INSTANCE = "nuget" | ||||||||||||
API_INDEX_URL = "https://api.nuget.org/v3/catalog0/index.json" | API_INDEX_URL = "https://api.nuget.org/v3/catalog0/index.json" | ||||||||||||
def __init__(
    self,
    scheduler: SchedulerInterface,
    credentials: Optional[CredentialsType] = None,
):
    """Set up the lister for the NuGet catalog index.

    Args:
        scheduler: scheduler interface handed over to the base lister.
        credentials: optional credentials handed over to the base lister.
    """
    # Instance name and URL are fixed class-level constants for this lister.
    super().__init__(
        url=self.API_INDEX_URL,
        instance=self.INSTANCE,
        scheduler=scheduler,
        credentials=credentials,
    )
    # Stays None until ``get_pages`` has read the catalog index timestamp.
    self.listing_date: Optional[datetime] = None
def state_from_dict(self, d: Dict[str, Any]) -> NugetListerState:
    """Build a ``NugetListerState`` from its serialized dict form.

    Args:
        d: mapping of state field names to values; ``last_listing_date``,
            when present and not ``None``, is parsed with
            ``iso8601.parse_date``.

    Returns:
        The state object built from the (converted) fields of ``d``.
    """
    # Convert on a shallow copy: the caller's dict must not be modified as a
    # side effect of deserializing it.
    fields = dict(d)
    raw_date = fields.get("last_listing_date")
    if raw_date is not None:
        fields["last_listing_date"] = iso8601.parse_date(raw_date)
    return NugetListerState(**fields)
def state_to_dict(self, state: NugetListerState) -> Dict[str, Any]:
    """Serialize ``state`` into a plain dict.

    Returns:
        A dict with the single key ``last_listing_date``, holding the ISO
        formatted date, or ``None`` when the state has no listing date.
    """
    listed_at = state.last_listing_date
    serialized: Optional[str] = None if listed_at is None else listed_at.isoformat()
    return {"last_listing_date": serialized}
def get_pages(self) -> Iterator[NugetListerPage]:
    """Yield the catalog pages that have to be listed.

    It uses the following endpoint `https://api.nuget.org/v3/catalog0/index.json`
    to get a list of pages endpoint to iterate. The ``commitTimeStamp`` of
    that index is stored in ``self.listing_date``.

    A page is fetched, and its ``items`` yielded, when the lister state holds
    no previous listing date or when the page ``commitTimeStamp`` is more
    recent than that date. A page whose request fails with ``HTTPError`` is
    logged and skipped.
    """
    index = self.http_request(url=self.url).json()

    # Both keys are checked up front, so a missing ``commitTimeStamp`` fails
    # here instead of being passed to ``iso8601.parse_date``.
    assert "items" in index
    assert "commitTimeStamp" in index
    self.listing_date = iso8601.parse_date(index["commitTimeStamp"])

    for page in index["items"]:
        assert page["@id"]
        assert page["commitTimeStamp"]
        commit_timestamp = iso8601.parse_date(page["commitTimeStamp"])

        # Fetch when ``(not A) or B``; there is no need to spell it as
        # ``(not A) or (A and B)``, both forms are equivalent. The guard
        # below is the negation of that condition.
        last_listing_date = self.state.last_listing_date
        if last_listing_date and commit_timestamp <= last_listing_date:
            continue

        # Only the request itself is guarded, like the nuspec fetch in
        # ``get_origins_from_page``.
        try:
            page_response = self.http_request(url=page["@id"])
        except HTTPError:
            logger.warning(
                "Failed to fetch page %s, skipping it from listing.",
                page["@id"],
            )
            continue

        page_data = page_response.json()
        assert "items" in page_data
        yield page_data["items"]
def get_origins_from_page(self, page: NugetListerPage) -> Iterator[ListedOrigin]: | def get_origins_from_page(self, page: NugetListerPage) -> Iterator[ListedOrigin]: | ||||||||||||
"""Iterate on all pages and yield ListedOrigin instances. | """Iterate on all pages and yield ListedOrigin instances. | ||||||||||||
.NET packages are binary, dll, etc. We retrieve only packages for which we can | .NET packages are binary, dll, etc. We retrieve only packages for which we can | ||||||||||||
find a vcs repository. | find a vcs repository. | ||||||||||||
To check if a vcs repository exists, we need for each entry in a page to retrieve | To check if a vcs repository exists, we need for each entry in a page to retrieve | ||||||||||||
a .nuspec file, which is a package metadata xml file, and search for a `repository` | a .nuspec file, which is a package metadata xml file, and search for a `repository` | ||||||||||||
Show All 12 Lines | def get_origins_from_page(self, page: NugetListerPage) -> Iterator[ListedOrigin]: | ||||||||||||
continue | continue | ||||||||||||
data = res.json() | data = res.json() | ||||||||||||
pkgname = data["id"] | pkgname = data["id"] | ||||||||||||
nuspec_url = ( | nuspec_url = ( | ||||||||||||
f"https://api.nuget.org/v3-flatcontainer/{pkgname.lower()}/" | f"https://api.nuget.org/v3-flatcontainer/{pkgname.lower()}/" | ||||||||||||
f"{data['version'].lower()}/{pkgname.lower()}.nuspec" | f"{data['version'].lower()}/{pkgname.lower()}.nuspec" | ||||||||||||
) | ) | ||||||||||||
try: | try: | ||||||||||||
res_metadata = self.http_request(url=nuspec_url) | res_metadata = self.http_request(url=nuspec_url) | ||||||||||||
except HTTPError: | except HTTPError: | ||||||||||||
logger.warning( | logger.warning( | ||||||||||||
"Failed to fetch nuspec file %s, skipping it from listing.", | "Failed to fetch nuspec file %s, skipping it from listing.", | ||||||||||||
nuspec_url, | nuspec_url, | ||||||||||||
) | ) | ||||||||||||
continue | continue | ||||||||||||
xml = BeautifulSoup(res_metadata.content, "xml") | xml = BeautifulSoup(res_metadata.content, "xml") | ||||||||||||
repo = xml.find("repository") | repo = xml.find("repository") | ||||||||||||
if repo and "url" in repo.attrs and "type" in repo.attrs: | if repo and "url" in repo.attrs and "type" in repo.attrs: | ||||||||||||
vcs_url = repo.attrs["url"] | vcs_url = repo.attrs["url"] | ||||||||||||
vcs_type = repo.attrs["type"] | vcs_type = repo.attrs["type"] | ||||||||||||
last_update = iso8601.parse_date(elt["commitTimeStamp"]) | |||||||||||||
yield ListedOrigin( | yield ListedOrigin( | ||||||||||||
Done Inline Actions — vlorentz: Is it possible for `commitTimeStamp` to be missing? If yes, please add a test for it.
Done Inline Actions — franckbret: No, it can't. From https://learn.microsoft.com/en-us/nuget/api/catalog-resource#catalog-index …
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||||||||||
visit_type=vcs_type, | visit_type=vcs_type, | ||||||||||||
url=vcs_url, | url=vcs_url, | ||||||||||||
last_update=None, | last_update=last_update, | ||||||||||||
) | ) | ||||||||||||
else: | else: | ||||||||||||
continue | continue | ||||||||||||
def finalize(self) -> None:
    """Store this run's listing date in the lister state and flag it as updated."""
    state = self.state
    # ``listing_date`` is the value set on the instance by ``get_pages``.
    state.last_listing_date = self.listing_date
    self.updated = True
vlorentz: unless `commitTimeStamp` can be absent (in which case you need to handle its absence instead of passing it to `iso8601.parse_date`)