diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index d0c6984..63604a1 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -1,145 +1,162 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging from pathlib import Path import subprocess from typing import Any, Dict, Iterator, List +from urllib.parse import urlparse import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. CratesListerPage = List[Dict[str, Any]] class CratesLister(StatelessLister[CratesListerPage]): """List origins from the "crates.io" forge. It basically fetches https://github.com/rust-lang/crates.io-index.git to a temp directory and then walks through each file to get the crate's info. """ # Part of the lister API, that identifies this lister LISTER_NAME = "crates" # (Optional) CVS type of the origins listed by this lister, if constant - VISIT_TYPE = "rust-crate" + VISIT_TYPE = "crates" INSTANCE = "crates" INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git" DESTINATION_PATH = Path("/tmp/crates.io-index") CRATE_FILE_URL_PATTERN = ( "https://static.crates.io/crates/{crate}/{crate}-{version}.crate" ) + CRATE_API_URL_PATTERN = "https://crates.io/api/v1/crates/{crate}" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.INDEX_REPOSITORY_URL, instance=self.INSTANCE, ) def get_index_repository(self) -> None: """Get crates.io-index repository up to date running git command.""" subprocess.check_call( [ "git", "clone", self.INDEX_REPOSITORY_URL, self.DESTINATION_PATH, ] ) def get_crates_index(self) -> List[Path]: """Build a sorted list of file paths excluding dotted directories and dotted files. Each file path corresponds to a crate that lists all available versions. """ crates_index = sorted( path for path in self.DESTINATION_PATH.rglob("*") if not any(part.startswith(".") for part in path.parts) and path.is_file() and path != self.DESTINATION_PATH / "config.json" ) return crates_index def get_pages(self) -> Iterator[CratesListerPage]: """Yield an iterator sorted by name in ascending order of pages. Each page is a list of crate versions with: - name: Name of the crate - version: Version - checksum: Checksum - crate_file: Url of the crate file - last_update: Date of the last commit of the corresponding index file """ # Fetch crates.io index repository self.get_index_repository() # Get a list of all crates files from the index repository crates_index = self.get_crates_index() logger.debug("found %s crates in crates_index", len(crates_index)) for crate in crates_index: page = [] # %cI is for strict iso8601 date formatting last_update_str = subprocess.check_output( ["git", "log", "-1", "--pretty=format:%cI", str(crate)], cwd=self.DESTINATION_PATH, ) last_update = iso8601.parse_date(last_update_str.decode().strip()) with crate.open("rb") as current_file: for line in current_file: data = json.loads(line) # pick only the data we need page.append( dict( name=data["name"], version=data["vers"], checksum=data["cksum"], crate_file=self.CRATE_FILE_URL_PATTERN.format( crate=data["name"], version=data["vers"] ), last_update=last_update, ) ) yield page def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: """Iterate on all crate pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None + url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) + last_update = page[0]["last_update"] + artifacts = [] + for version in page: - yield ListedOrigin( - lister_id=self.lister_obj.id, - visit_type=self.VISIT_TYPE, - url=version["crate_file"], - last_update=version["last_update"], - extra_loader_arguments={ - "name": version["name"], - "version": version["version"], - "checksum": version["checksum"], + filename = urlparse(version["crate_file"]).path.split("/")[-1] + # Build an artifact entry following original-artifacts-json specification + # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 + artifact = { + "filename": f"{filename}", + "checksums": { + "sha256": f"{version['checksum']}", }, - ) + "url": version["crate_file"], + "version": version["version"], + } + artifacts.append(artifact) + + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=self.VISIT_TYPE, + url=url, + last_update=last_update, + extra_loader_arguments={ + "artifacts": artifacts, + }, + ) diff --git a/swh/lister/crates/tests/test_lister.py b/swh/lister/crates/tests/test_lister.py index b92ce56..bbb1c7d 100644 --- a/swh/lister/crates/tests/test_lister.py +++ b/swh/lister/crates/tests/test_lister.py @@ -1,89 +1,114 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path from swh.lister.crates.lister import CratesLister from swh.lister.crates.tests import prepare_repository_from_archive expected_origins = [ { - "name": "rand", - "version": "0.1.1", - "checksum": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d", - "url": "https://static.crates.io/crates/rand/rand-0.1.1.crate", + "url": "https://crates.io/api/v1/crates/rand", + "artifacts": [ + { + "checksums": { + "sha256": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d", # noqa: B950 + }, + "filename": "rand-0.1.1.crate", + "url": "https://static.crates.io/crates/rand/rand-0.1.1.crate", + "version": "0.1.1", + }, + { + "checksums": { + "sha256": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7", # noqa: B950 + }, + "filename": "rand-0.1.2.crate", + "url": "https://static.crates.io/crates/rand/rand-0.1.2.crate", + "version": "0.1.2", + }, + ], }, { - "name": "rand", - "version": "0.1.2", - "checksum": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7", - "url": "https://static.crates.io/crates/rand/rand-0.1.2.crate", + "url": "https://crates.io/api/v1/crates/regex", + "artifacts": [ + { + "checksums": { + "sha256": "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5", # noqa: B950 + }, + "filename": "regex-0.1.0.crate", + "url": "https://static.crates.io/crates/regex/regex-0.1.0.crate", + "version": "0.1.0", + }, + { + "checksums": { + "sha256": "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36", # noqa: B950 + }, + "filename": "regex-0.1.1.crate", + "url": "https://static.crates.io/crates/regex/regex-0.1.1.crate", + "version": "0.1.1", + }, + { + "checksums": { + "sha256": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9", # noqa: B950 + }, + "filename": "regex-0.1.2.crate", + "url": "https://static.crates.io/crates/regex/regex-0.1.2.crate", + "version": "0.1.2", + }, + { + "checksums": { + "sha256": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3", # noqa: B950 + }, + "filename": "regex-0.1.3.crate", + "url": "https://static.crates.io/crates/regex/regex-0.1.3.crate", + "version": "0.1.3", + }, + ], }, { - "name": "regex", - "version": "0.1.0", - "checksum": "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5", - "url": "https://static.crates.io/crates/regex/regex-0.1.0.crate", - }, - { - "name": "regex", - "version": "0.1.1", - "checksum": "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36", - "url": "https://static.crates.io/crates/regex/regex-0.1.1.crate", - }, - { - "name": "regex", - "version": "0.1.2", - "checksum": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9", - "url": "https://static.crates.io/crates/regex/regex-0.1.2.crate", - }, - { - "name": "regex", - "version": "0.1.3", - "checksum": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3", - "url": "https://static.crates.io/crates/regex/regex-0.1.3.crate", - }, - { - "name": "regex-syntax", - "version": "0.1.0", - "checksum": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944", - "url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate", + "url": "https://crates.io/api/v1/crates/regex-syntax", + "artifacts": [ + { + "checksums": { + "sha256": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944", # noqa: B950 + }, + "filename": "regex-syntax-0.1.0.crate", + "url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate", + "version": "0.1.0", + }, + ], }, ] def test_crates_lister(datadir, tmp_path, swh_scheduler): archive_path = Path(datadir, "fake-crates-repository.tar.gz") repo_url = prepare_repository_from_archive( archive_path, "crates.io-index", tmp_path ) lister = CratesLister(scheduler=swh_scheduler) lister.INDEX_REPOSITORY_URL = repo_url lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" res = lister.run() assert res.pages == 3 - assert res.origins == 7 + assert res.origins == 3 expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url")) scheduler_origins_sorted = sorted( swh_scheduler.get_listed_origins(lister.lister_obj.id).results, key=lambda x: x.url, ) for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted): - assert scheduled.visit_type == "rust-crate" + assert scheduled.visit_type == "crates" assert scheduled.url == expected.get("url") - assert scheduled.extra_loader_arguments.get("name") == expected.get("name") - assert scheduled.extra_loader_arguments.get("version") == expected.get( - "version" - ) - assert scheduled.extra_loader_arguments.get("checksum") == expected.get( - "checksum" + assert scheduled.extra_loader_arguments.get("artifacts") == expected.get( + "artifacts" ) assert len(scheduler_origins_sorted) == len(expected_origins_sorted)