diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py --- a/swh/lister/aur/lister.py +++ b/swh/lister/aur/lister.py @@ -2,13 +2,10 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import datetime -import gzip -import json import logging -from pathlib import Path -import shutil -from typing import Any, Dict, Iterator, Optional +from typing import Any, Dict, Iterator, List, Optional import requests @@ -47,8 +44,6 @@ PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git" PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz" - DESTINATION_PATH = Path("/tmp/aur_archive") - def __init__( self, scheduler: SchedulerInterface, @@ -61,7 +56,7 @@ url=self.BASE_URL, ) - def download_index_archive(self) -> Path: - """Build an url based on self.DEFAULT_PACKAGES_INDEX_URL format string, - and download the archive to self.DESTINATION_PATH + def download_packages_index(self) -> List[Dict[str, Any]]: + """Build a URL based on self.DEFAULT_PACKAGES_INDEX_URL format string, + and request it to retrieve the packages index as parsed JSON @@ -69,16 +64,7 @@ - a directory Path where the archive has been downloaded to. + a list of dicts, one per AUR package entry of the index. """ url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url) - filename = url.split("/")[-1] - destination = self.DESTINATION_PATH / filename - - self.DESTINATION_PATH.mkdir(exist_ok=True) - - response = requests.get(url, stream=True) - destination.write_bytes(response.raw.read()) - assert destination.exists() - - return destination + return requests.get(url).json() def get_pages(self) -> Iterator[AurListerPage]: """Yield an iterator which returns 'page' @@ -88,27 +74,21 @@ a canonical 'snapshot_url' from which a tar.gz archive of the package can be downloaded. 
""" - index = self.download_index_archive() - - with gzip.open(index, "rb") as f: - assert f.readable() - file_content = f.read() - packages = json.loads(file_content) + packages = self.download_packages_index() - assert packages + logger.debug("Found %s AUR packages in aur_index", len(packages)) - counter: int = 0 for package in packages: # Exclude lines where Name differs from PackageBase as they represents # split package and they don't have resolvable snapshots url if package["Name"] == package["PackageBase"]: + logger.debug("Processing AUR package %s", package["Name"]) pkgname = package["PackageBase"] version = package["Version"] project_url = package["URL"] last_modified = datetime.datetime.fromtimestamp( float(package["LastModified"]), tz=datetime.timezone.utc ).isoformat() - counter += 1 yield { "pkgname": pkgname, "version": version, @@ -121,7 +101,6 @@ "project_url": project_url, "last_modified": last_modified, } - logger.debug("Found %s AUR packages in aur_index", counter) def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances. 
@@ -163,11 +142,3 @@ "aur_metadata": aur_metadata, }, ) - - def finalize(self) -> None: - # Cleanup by removing the repository directory - if self.DESTINATION_PATH.exists(): - shutil.rmtree(self.DESTINATION_PATH) - logger.debug( - "Successfully removed %s directory", str(self.DESTINATION_PATH) - ) diff --git a/swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz b/swh/lister/aur/tests/data/packages-meta-v1.json.gz rename from swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz rename to swh/lister/aur/tests/data/packages-meta-v1.json.gz diff --git a/swh/lister/aur/tests/test_lister.py b/swh/lister/aur/tests/test_lister.py --- a/swh/lister/aur/tests/test_lister.py +++ b/swh/lister/aur/tests/test_lister.py @@ -2,6 +2,11 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + +import gzip +import json +import os + from swh.lister.aur.lister import AurLister expected_origins = [ @@ -92,13 +97,22 @@ ] -def test_aur_lister(datadir, requests_mock_datadir, swh_scheduler): +def test_aur_lister(datadir, swh_scheduler, requests_mock): + lister = AurLister(scheduler=swh_scheduler) + + packages_index_filename = "packages-meta-v1.json.gz" + + # simulate requests behavior: gzip and deflate transfer-encodings are automatically decoded + with gzip.open(os.path.join(datadir, packages_index_filename), "rb") as f: + requests_mock.get( + f"{lister.BASE_URL}/{packages_index_filename}", json=json.loads(f.read()) + ) + res = lister.run() assert res.pages == 4 assert res.origins == 4 - scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert [ @@ -116,10 +130,3 @@ ) for expected in sorted(expected_origins, key=lambda expected: expected["url"]) ] - - -def test_aur_lister_directory_cleanup(datadir, requests_mock_datadir, swh_scheduler): - lister = 
AurLister(scheduler=swh_scheduler) - lister.run() - # Repository directory should not exists after the lister runs - assert not lister.DESTINATION_PATH.exists()