def register():
    """Describe the crates lister plugin.

    This is the callable that setup.py references as
    ``lister.crates=swh.lister.crates:register``.

    Returns:
        A dict with the lister class under ``"lister"`` and, under
        ``"task_modules"``, a one-element list holding the dotted name of this
        package's ``tasks`` module.
    """
    # The import is deferred to call time rather than done at module load
    from .lister import CratesLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": CratesLister, "task_modules": [tasks_module]}
# One page holds every listed version of a single crate.
CratesListerPage = List[Dict[str, Any]]


class CratesLister(StatelessLister[CratesListerPage]):
    """List origins from the "crates.io" forge.

    The lister fetches https://github.com/rust-lang/crates.io-index.git into
    ``DESTINATION_PATH`` and then walks through each index file to get the
    crate's info. Every non-blank line of an index file is parsed as one JSON
    document describing one version of the crate.
    """

    # Part of the lister API, that identifies this lister
    LISTER_NAME = "crates"
    # (Optional) VCS type of the origins listed by this lister, if constant
    VISIT_TYPE = "rust-crate"

    INSTANCE = "crates"
    INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git"
    DESTINATION_PATH = Path("/tmp/crates.io-index")
    CRATE_FILE_URL_PATTERN = (
        "https://static.crates.io/crates/{crate}/{crate}-{version}.crate"
    )

    def __init__(
        self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=self.INDEX_REPOSITORY_URL,
            instance=self.INSTANCE,
        )

    def get_index_repository(self) -> None:
        """Bring the local copy of the crates.io index up to date using git.

        The index is cloned when ``DESTINATION_PATH`` holds no checkout yet.
        ``git clone`` refuses to write into an existing non-empty directory,
        so an existing checkout is refreshed in place instead of being cloned
        again.

        Raises:
            subprocess.CalledProcessError: if a git command exits with a
                non-zero status.
        """
        checkout = self.DESTINATION_PATH

        if not (checkout / ".git").is_dir():
            subprocess.check_call(
                ["git", "clone", self.INDEX_REPOSITORY_URL, str(checkout)]
            )
            return

        # Fetch the remote HEAD and hard-reset onto it rather than merging, so
        # the checkout mirrors the remote even if its history was rewritten.
        subprocess.check_call(
            ["git", "fetch", self.INDEX_REPOSITORY_URL, "HEAD"], cwd=checkout
        )
        subprocess.check_call(["git", "reset", "--hard", "FETCH_HEAD"], cwd=checkout)

    def get_crates_index(self) -> List[Path]:
        """Build a sorted list of file paths excluding dotted directories and
        dotted files.

        Each file path corresponds to a crate that lists all available
        versions. The top-level ``config.json`` file is left out.
        """
        root = self.DESTINATION_PATH
        config_file = root / "config.json"

        return sorted(
            path
            for path in root.rglob("*")
            # Only the parts *below* the checkout root are inspected, so a
            # dotted directory above DESTINATION_PATH does not hide every crate.
            if not any(part.startswith(".") for part in path.relative_to(root).parts)
            and path.is_file()
            and path != config_file
        )

    def get_pages(self) -> Iterator[CratesListerPage]:
        """Yield one page per crate, following the sorted order of the index files.

        Each page is a list of crate versions with:
            - name: Name of the crate
            - version: Version
            - checksum: Checksum
            - crate_file: Url of the crate file
            - last_update: Date of the last commit of the corresponding index
              file
        """
        # Fetch crates.io index repository
        self.get_index_repository()
        # Get a list of all crates files from the index repository
        crates_index = self.get_crates_index()
        logger.debug("found %s crates in crates_index", len(crates_index))

        for crate in crates_index:
            # %cI is for strict iso8601 date formatting
            last_update_str = subprocess.check_output(
                ["git", "log", "-1", "--pretty=format:%cI", str(crate)],
                cwd=self.DESTINATION_PATH,
            )
            last_update = iso8601.parse_date(last_update_str.decode().strip())

            page: CratesListerPage = []
            with crate.open("rb") as current_file:
                for line in current_file:
                    # A blank line is not a JSON document; skip it instead of
                    # letting json.loads abort the whole listing.
                    if not line.strip():
                        continue
                    data = json.loads(line)
                    # pick only the data we need
                    page.append(
                        {
                            "name": data["name"],
                            "version": data["vers"],
                            "checksum": data["cksum"],
                            "crate_file": self.CRATE_FILE_URL_PATTERN.format(
                                crate=data["name"], version=data["vers"]
                            ),
                            "last_update": last_update,
                        }
                    )
            yield page

    def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]:
        """Yield one ListedOrigin per crate version found in ``page``.

        The origin url is the crate file url; name, version and checksum are
        forwarded as extra loader arguments.
        """

        assert self.lister_obj.id is not None

        for version in page:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=version["crate_file"],
                last_update=version["last_update"],
                extra_loader_arguments={
                    "name": version["name"],
                    "version": version["version"],
                    "checksum": version["checksum"],
                },
            )
def prepare_repository_from_archive(
    archive_path: Union["os.PathLike[str]", str],
    filename: Optional[str] = None,
    tmp_path: Union["os.PathLike[str]", str] = "/tmp",
) -> str:
    """Given an existing archive_path, uncompress it.
    Returns a file repo url which can be used as origin url.

    This does not deal with the case where the archive passed along does not exist.

    Args:
        archive_path: Archive to extract, as a string or path object.
        filename: Last component of the returned url; when empty or None the
            base name of ``archive_path`` is used instead.
        tmp_path: Existing directory the archive is extracted into.

    Returns:
        The url ``file://<tmp_path>/<filename>``.

    Raises:
        subprocess.CalledProcessError: if ``tar`` exits with a non-zero status.
    """
    destination = os.fspath(tmp_path)
    # uncompress folder/repositories/dump for the loader to ingest;
    # check=True raises on failure, and tar's output is not captured because
    # nothing reads it
    subprocess.run(
        ["tar", "xf", os.fspath(archive_path), "-C", destination], check=True
    )
    # build the origin url (or some derivative form)
    _fname = filename or os.path.basename(archive_path)
    return f"file://{destination}/{_fname}"
_CRATE_URL = "https://static.crates.io/crates/{name}/{name}-{version}.crate"

# (name, version, checksum) of the crate versions the lister is expected to
# find in the fake index archive
_EXPECTED_VERSIONS = [
    (
        "rand",
        "0.1.1",
        "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d",
    ),
    (
        "rand",
        "0.1.2",
        "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7",
    ),
    (
        "regex",
        "0.1.0",
        "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5",
    ),
    (
        "regex",
        "0.1.1",
        "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36",
    ),
    (
        "regex",
        "0.1.2",
        "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9",
    ),
    (
        "regex",
        "0.1.3",
        "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3",
    ),
    (
        "regex-syntax",
        "0.1.0",
        "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944",
    ),
]

expected_origins = [
    {
        "name": name,
        "version": version,
        "checksum": checksum,
        "url": _CRATE_URL.format(name=name, version=version),
    }
    for name, version, checksum in _EXPECTED_VERSIONS
]


def test_crates_lister(datadir, tmp_path, swh_scheduler):
    """Run the lister against the fake index archive and check what it records."""
    archive_path = Path(datadir, "fake-crates-repository.tar.gz")
    repo_url = prepare_repository_from_archive(
        archive_path, "crates.io-index", tmp_path
    )

    # Point the lister at the extracted repository instead of the real index
    lister = CratesLister(scheduler=swh_scheduler)
    lister.INDEX_REPOSITORY_URL = repo_url
    lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests"

    res = lister.run()

    assert res.pages == 3
    assert res.origins == 7

    wanted = sorted(expected_origins, key=lambda origin: origin.get("url"))
    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    recorded = sorted(listed, key=lambda origin: origin.url)

    for scheduled, expected in zip(recorded, wanted):
        assert scheduled.visit_type == "rust-crate"
        assert scheduled.url == expected.get("url")
        loader_arguments = scheduled.extra_loader_arguments
        for key in ("name", "version", "checksum"):
            assert loader_arguments.get(key) == expected.get(key)

    assert len(recorded) == len(wanted)