diff --git a/swh/lister/nuget/__init__.py b/swh/lister/nuget/__init__.py --- a/swh/lister/nuget/__init__.py +++ b/swh/lister/nuget/__init__.py @@ -20,9 +20,13 @@ Nuget.org provides an `http api`_ with several endpoint to discover and list packages and versions. -The recommended way to retrieve all packages is to use the `catalog`_ api endpoint. -It provides a first endpoint that list all available pages. We then iterate to get -content of related pages. +The recommended way to `retrieve all packages`_ is to use the `catalog`_ api endpoint. +It provides a `catalog index endpoint`_ that lists all available pages. We then iterate to +get content of related pages. + +The lister is incremental following a `cursor`_ principle, based on the value of +``commitTimeStamp`` from the catalog index endpoint. It retrieves only pages for which +``commitTimeStamp`` is greater than ``lister.state.last_listing_date``. Page listing ------------ @@ -65,6 +69,9 @@ .. _nuget.org/packages: https://www.nuget.org/packages .. _http api: https://api.nuget.org/v3/index.json .. _catalog: https://learn.microsoft.com/en-us/nuget/api/catalog-resource +.. _catalog index endpoint: https://learn.microsoft.com/en-us/nuget/api/catalog-resource#catalog-page-object-in-the-index # noqa: B950 +.. _retrieve all packages: https://learn.microsoft.com/en-us/nuget/guides/api/query-for-all-published-packages#initialize-a-cursor # noqa: B950 +.. _cursor: https://learn.microsoft.com/en-us/nuget/api/catalog-resource#cursor .. _package metadata: https://learn.microsoft.com/en-us/nuget/api/registration-base-url-resource ..
_package manifest: https://learn.microsoft.com/en-us/nuget/api/package-base-address-resource#download-package-manifest-nuspec # noqa: B950 """ diff --git a/swh/lister/nuget/lister.py b/swh/lister/nuget/lister.py --- a/swh/lister/nuget/lister.py +++ b/swh/lister/nuget/lister.py @@ -3,24 +3,36 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from dataclasses import dataclass +from datetime import datetime import logging -from typing import Dict, Iterator, List, Optional +from typing import Any, Dict, Iterator, List, Optional from bs4 import BeautifulSoup +import iso8601 from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) + # Aliasing the page results returned by `get_pages` method from the lister. 
NugetListerPage = List[Dict[str, str]] -class NugetLister(StatelessLister[NugetListerPage]): +@dataclass +class NugetListerState: + """Store lister state for incremental mode operations""" + + last_listing_date: Optional[datetime] = None + """Last date from main http api endpoint when lister was executed""" + + +class NugetLister(Lister[NugetListerState, NugetListerPage]): """List Nuget (Package manager for .NET) origins.""" LISTER_NAME = "nuget" @@ -39,6 +51,20 @@ instance=self.INSTANCE, url=self.API_INDEX_URL, ) + self.listing_date: Optional[datetime] = None + + def state_from_dict(self, d: Dict[str, Any]) -> NugetListerState: + last_listing_date = d.get("last_listing_date") + if last_listing_date is not None: + d["last_listing_date"] = iso8601.parse_date(last_listing_date) + return NugetListerState(**d) + + def state_to_dict(self, state: NugetListerState) -> Dict[str, Any]: + d: Dict[str, Optional[str]] = {"last_listing_date": None} + last_listing_date = state.last_listing_date + if last_listing_date is not None: + d["last_listing_date"] = last_listing_date.isoformat() + return d def get_pages(self) -> Iterator[NugetListerPage]: """Yield an iterator which returns 'page' @@ -48,21 +74,33 @@ """ index_response = self.http_request(url=self.url) index = index_response.json() - assert "items" in index + assert "commitTimeStamp" in index + self.listing_date = iso8601.parse_date(index.get("commitTimeStamp")) + + assert "items" in index for page in index["items"]: + assert page["@id"] - try: - page_response = self.http_request(url=page["@id"]) - page_data = page_response.json() - assert "items" in page_data - yield page_data["items"] - except HTTPError: - logger.warning( - "Failed to fetch page %s, skipping it from listing.", - page["@id"], - ) - continue + assert page["commitTimeStamp"] + + commit_timestamp = iso8601.parse_date(page["commitTimeStamp"]) + + if not self.state.last_listing_date or ( + self.state.last_listing_date + and commit_timestamp > 
self.state.last_listing_date + ): + try: + page_response = self.http_request(url=page["@id"]) + page_data = page_response.json() + assert "items" in page_data + yield page_data["items"] + except HTTPError: + logger.warning( + "Failed to fetch page %s, skipping it from listing.", + page["@id"], + ) + continue def get_origins_from_page(self, page: NugetListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances. @@ -91,6 +129,7 @@ f"https://api.nuget.org/v3-flatcontainer/{pkgname.lower()}/" f"{data['version'].lower()}/{pkgname.lower()}.nuspec" ) + try: res_metadata = self.http_request(url=nuspec_url) except HTTPError: @@ -104,11 +143,18 @@ if repo and "url" in repo.attrs and "type" in repo.attrs: vcs_url = repo.attrs["url"] vcs_type = repo.attrs["type"] + last_update = None + if "commitTimeStamp" in elt: + last_update = iso8601.parse_date(elt["commitTimeStamp"]) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=vcs_type, url=vcs_url, - last_update=None, + last_update=last_update, ) else: continue + + def finalize(self) -> None: + self.state.last_listing_date = self.listing_date + self.updated = True diff --git a/swh/lister/nuget/tests/data/https_api.nuget.org/v3-flatcontainer_moq.automock_3.5.0-ci0287_moq.automock.nuspec b/swh/lister/nuget/tests/data/https_api.nuget.org/v3-flatcontainer_moq.automock_3.5.0-ci0287_moq.automock.nuspec new file mode 100644 --- /dev/null +++ b/swh/lister/nuget/tests/data/https_api.nuget.org/v3-flatcontainer_moq.automock_3.5.0-ci0287_moq.automock.nuspec @@ -0,0 +1,25 @@ + + + + Moq.AutoMock + 3.5.0-ci0287 + Tim Kellogg, Adam Hewitt, Kevin Bost + LICENSE + https://aka.ms/deprecateLicenseUrl + https://github.com/moq/Moq.AutoMocker + An auto-mocking container that generates mocks using Moq + Copyright Tim Kellogg 2022 + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_data_2022.10.10.04.04.00_moq.automock.3.5.0-ci0287.json b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_data_2022.10.10.04.04.00_moq.automock.3.5.0-ci0287.json new file mode 100644 --- /dev/null +++ b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_data_2022.10.10.04.04.00_moq.automock.3.5.0-ci0287.json @@ -0,0 +1,187 @@ +{ + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json", + "@type": [ + "PackageDetails", + "catalog:Permalink" + ], + "authors": "Tim Kellogg, Adam Hewitt, Kevin Bost", + "catalog:commitId": "de4b22b8-397b-4fa1-a160-db3a7c5b17cd", + "catalog:commitTimeStamp": "2022-10-10T04:04:00.6654802Z", + "copyright": "Copyright Tim Kellogg 2022", + "created": "2022-10-10T04:01:52.21Z", + "description": "An auto-mocking container that generates mocks using Moq", + "id": "Moq.AutoMock", + "isPrerelease": true, + "lastEdited": "2022-10-10T04:03:52.51Z", + "licenseFile": "LICENSE", + "licenseUrl": "https://aka.ms/deprecateLicenseUrl", + "listed": true, + "packageHash": "jtvxZ9lJGiNWCvKx4oZByy/knRu86ze833hZa2XvAbzYcSR3gSesdWgbGw1yNGDY0TuHobTETq/lorrtE2/pPA==", + "packageHashAlgorithm": "SHA512", + "packageSize": 70853, + "projectUrl": "https://github.com/moq/Moq.AutoMocker", + "published": "2022-10-10T04:01:52.21Z", + "repository": "", + "verbatimVersion": "3.5.0-ci0287", + "version": "3.5.0-ci0287", + "dependencyGroups": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1", + "@type": "PackageDependencyGroup", + "dependencies": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1/moq", + "@type": "PackageDependency", + "id": "Moq", + "range": "[4.18.2, )" + }, + { + "@id": 
"https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1/nonblocking", + "@type": "PackageDependency", + "id": "NonBlocking", + "range": "[2.1.0, )" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1/system.valuetuple", + "@type": "PackageDependency", + "id": "System.ValueTuple", + "range": "[4.5.0, )" + } + ], + "targetFramework": ".NETFramework4.6.1" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netstandard2.0", + "@type": "PackageDependencyGroup", + "dependencies": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netstandard2.0/moq", + "@type": "PackageDependency", + "id": "Moq", + "range": "[4.18.2, )" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netstandard2.0/nonblocking", + "@type": "PackageDependency", + "id": "NonBlocking", + "range": "[2.1.0, )" + } + ], + "targetFramework": ".NETStandard2.0" + } + ], + "packageEntries": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#Moq.AutoMock.nuspec", + "@type": "PackageEntry", + "compressedLength": 567, + "fullName": "Moq.AutoMock.nuspec", + "length": 1287, + "name": "Moq.AutoMock.nuspec" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/net461/Moq.AutoMock.dll", + "@type": "PackageEntry", + "compressedLength": 17993, + "fullName": "lib/net461/Moq.AutoMock.dll", + "length": 41984, + "name": "Moq.AutoMock.dll" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/net461/Moq.AutoMock.xml", + "@type": "PackageEntry", + "compressedLength": 5031, + 
"fullName": "lib/net461/Moq.AutoMock.xml", + "length": 55041, + "name": "Moq.AutoMock.xml" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/netstandard2.0/Moq.AutoMock.dll", + "@type": "PackageEntry", + "compressedLength": 17927, + "fullName": "lib/netstandard2.0/Moq.AutoMock.dll", + "length": 41984, + "name": "Moq.AutoMock.dll" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/netstandard2.0/Moq.AutoMock.xml", + "@type": "PackageEntry", + "compressedLength": 5031, + "fullName": "lib/netstandard2.0/Moq.AutoMock.xml", + "length": 55041, + "name": "Moq.AutoMock.xml" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#LICENSE", + "@type": "PackageEntry", + "compressedLength": 628, + "fullName": "LICENSE", + "length": 1068, + "name": "LICENSE" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#analyzers/dotnet/cs/Moq.AutoMocker.TestGenerator.dll", + "@type": "PackageEntry", + "compressedLength": 9686, + "fullName": "analyzers/dotnet/cs/Moq.AutoMocker.TestGenerator.dll", + "length": 25088, + "name": "Moq.AutoMocker.TestGenerator.dll" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#.signature.p7s", + "@type": "PackageEntry", + "compressedLength": 11534, + "fullName": ".signature.p7s", + "length": 11534, + "name": ".signature.p7s" + } + ], + "@context": { + "@vocab": "http://schema.nuget.org/schema#", + "catalog": "http://schema.nuget.org/catalog#", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "dependencies": { + "@id": "dependency", + "@container": "@set" + }, + "dependencyGroups": { + "@id": "dependencyGroup", + "@container": "@set" + }, + "packageEntries": { + "@id": "packageEntry", + "@container": "@set" + }, + "packageTypes": { + "@id": "packageType", + "@container": 
"@set" + }, + "supportedFrameworks": { + "@id": "supportedFramework", + "@container": "@set" + }, + "tags": { + "@id": "tag", + "@container": "@set" + }, + "vulnerabilities": { + "@id": "vulnerability", + "@container": "@set" + }, + "published": { + "@type": "xsd:dateTime" + }, + "created": { + "@type": "xsd:dateTime" + }, + "lastEdited": { + "@type": "xsd:dateTime" + }, + "catalog:commitTimeStamp": { + "@type": "xsd:dateTime" + }, + "reasons": { + "@container": "@set" + } + } +} \ No newline at end of file diff --git a/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_index.json_visit1 b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_index.json_visit1 new file mode 100644 --- /dev/null +++ b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_index.json_visit1 @@ -0,0 +1,46 @@ +{ + "@id": "https://api.nuget.org/v3/catalog0/index.json", + "@type": [ + "CatalogRoot", + "AppendOnlyCatalog", + "Permalink" + ], + "commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698", + "commitTimeStamp": "2022-10-10T04:20:52.8660454Z", + "count": 16959, + "nuget:lastCreated": "2022-10-10T04:20:52.8660454Z", + "nuget:lastDeleted": "2022-10-10T04:20:52.8660454Z", + "nuget:lastEdited": "2022-10-10T04:20:52.8660454Z", + "items": [ + { + "@id": "https://api.nuget.org/v3/catalog0/page17100.json", + "@type": "CatalogPage", + "commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698", + "commitTimeStamp": "2022-10-10T04:20:52.8660454Z", + "count": 545 + } + ], + "@context": { + "@vocab": "http://schema.nuget.org/catalog#", + "nuget": "http://schema.nuget.org/schema#", + "items": { + "@id": "item", + "@container": "@set" + }, + "parent": { + "@type": "@id" + }, + "commitTimeStamp": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastCreated": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastEdited": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastDeleted": { + "@type": 
"http://www.w3.org/2001/XMLSchema#dateTime" + } + } +} diff --git a/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_page17100.json b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_page17100.json new file mode 100644 --- /dev/null +++ b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_page17100.json @@ -0,0 +1,49 @@ +{ + "@id": "https://api.nuget.org/v3/catalog0/page17100.json", + "@type": "CatalogPage", + "commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698", + "commitTimeStamp": "2022-10-10T04:20:52.8660454Z", + "count": 545, + "parent": "https://api.nuget.org/v3/catalog0/index.json", + "items": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json", + "@type": "nuget:PackageDetails", + "commitId": "de4b22b8-397b-4fa1-a160-db3a7c5b17cd", + "commitTimeStamp": "2022-10-10T04:04:00.6654802Z", + "nuget:id": "Moq.AutoMock", + "nuget:version": "3.5.0-ci0287" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.20.52/alzabox.api.sdk.0.0.13.json", + "@type": "nuget:PackageDetails", + "commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698", + "commitTimeStamp": "2022-10-10T04:20:52.8660454Z", + "nuget:id": "Alzabox.API.SDK", + "nuget:version": "0.0.13" + } + ], + "@context": { + "@vocab": "http://schema.nuget.org/catalog#", + "nuget": "http://schema.nuget.org/schema#", + "items": { + "@id": "item", + "@container": "@set" + }, + "parent": { + "@type": "@id" + }, + "commitTimeStamp": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastCreated": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastEdited": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastDeleted": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + } + } +} diff --git a/swh/lister/nuget/tests/test_lister.py b/swh/lister/nuget/tests/test_lister.py --- a/swh/lister/nuget/tests/test_lister.py +++ b/swh/lister/nuget/tests/test_lister.py @@ 
-6,6 +6,7 @@ from swh.lister.nuget.lister import NugetLister expected_origins = ["https://github.com/sillsdev/libpalaso.git"] +expected_origins_incremental = ["https://github.com/moq/Moq.AutoMocker"] def test_nuget_lister(datadir, requests_mock_datadir, swh_scheduler): @@ -32,3 +33,104 @@ ) for url in expected_origins ] + + +def test_nuget_lister_incremental(datadir, requests_mock_datadir_visits, swh_scheduler): + # First run + lister = NugetLister(scheduler=swh_scheduler) + assert lister.state.last_listing_date is None + + res = lister.run() + assert res.pages == 2 + assert res.origins == 1 + assert lister.state.last_listing_date + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert len(scheduler_origins) == len(expected_origins) + + assert [ + ( + scheduled.visit_type, + scheduled.url, + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "git", + url, + ) + for url in expected_origins + ] + + last_date = lister.state.last_listing_date + + # Second run + lister = NugetLister(scheduler=swh_scheduler) + assert lister.state.last_listing_date == last_date + res = lister.run() + # One page and one new origin + assert lister.state.last_listing_date > last_date + assert res.pages == 1 + assert res.origins == 1 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert len(scheduler_origins) == len( + expected_origins + expected_origins_incremental + ) + assert [ + ( + scheduled.visit_type, + scheduled.url, + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "git", + url, + ) + for url in sorted(expected_origins + expected_origins_incremental) + ] + + +def test_nuget_lister_incremental_no_changes( + datadir, requests_mock_datadir, swh_scheduler +): + # First run + lister = NugetLister(scheduler=swh_scheduler) + assert lister.state.last_listing_date is None + + res = lister.run() + assert 
res.pages == 2 + assert res.origins == 1 + assert lister.state.last_listing_date + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert len(scheduler_origins) == len(expected_origins) + + assert [ + ( + scheduled.visit_type, + scheduled.url, + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "git", + url, + ) + for url in expected_origins + ] + + last_date = lister.state.last_listing_date + + # Second run + lister = NugetLister(scheduler=swh_scheduler) + assert lister.state.last_listing_date == last_date + res = lister.run() + # Nothing new + assert lister.state.last_listing_date == last_date + assert res.pages == 0 + assert res.origins == 0