Page MenuHomeSoftware Heritage

D7367.diff
No OneTemporary

D7367.diff

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,6 +17,7 @@
- id: codespell
name: Check source code spelling
exclude: ^(swh/lister/.*/tests/data/.*)$
+ args: [-L crate]
stages: [commit]
- id: codespell
name: Check commit message spelling
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,7 @@
lister.bitbucket=swh.lister.bitbucket:register
lister.cgit=swh.lister.cgit:register
lister.cran=swh.lister.cran:register
+ lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
lister.gitea=swh.lister.gitea:register
lister.github=swh.lister.github:register
diff --git a/swh/lister/crates/__init__.py b/swh/lister/crates/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+ def register():
+     """Describe the crates lister for the ``lister.crates`` entry point.
+
+     The lister class is imported lazily, inside the function, and returned
+     together with the dotted name of this package's ``tasks`` module.
+     """
+     from .lister import CratesLister
+
+     tasks_module = "%s.tasks" % __name__
+     return {"lister": CratesLister, "task_modules": [tasks_module]}
diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/lister.py
@@ -0,0 +1,138 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import logging
+from pathlib import Path
+import subprocess
+from typing import Any, Dict, Iterator, List
+
+import iso8601
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+ # Type alias for a page of results yielded by the lister's `get_pages` method.
+CratesListerPage = List[Dict[str, Any]]
+
+
+ class CratesLister(StatelessLister[CratesListerPage]):
+     """List origins from the "crates.io" forge.
+
+     The lister clones https://github.com/rust-lang/crates.io-index.git into
+     ``DESTINATION_PATH`` (or updates the clone already present there), then
+     reads every index file of the checkout. Each index file describes one
+     crate and holds one JSON document per line, one line per version.
+     """
+
+     # Part of the lister API, that identifies this lister
+     LISTER_NAME = "crates"
+     # (Optional) VCS type of the origins listed by this lister, if constant
+     VISIT_TYPE = "rust-crate"
+
+     INSTANCE = "crates"
+     INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git"
+     DESTINATION_PATH = Path("/tmp/crates.io-index")
+     CRATE_FILE_URL_PATTERN = (
+         "https://static.crates.io/crates/{crate}/{crate}-{version}.crate"
+     )
+
+     def __init__(
+         self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
+     ):
+         super().__init__(
+             scheduler=scheduler,
+             credentials=credentials,
+             url=self.INDEX_REPOSITORY_URL,
+             instance=self.INSTANCE,
+         )
+
+     def get_index_repository(self) -> None:
+         """Bring the local copy of the crates.io index up to date.
+
+         Clones ``INDEX_REPOSITORY_URL`` into ``DESTINATION_PATH`` on the first
+         run; when a git checkout is already present there, it is
+         fast-forwarded from the same url instead.
+
+         Raises:
+             subprocess.CalledProcessError: if the git command fails.
+         """
+         destination = Path(self.DESTINATION_PATH)
+
+         if (destination / ".git").is_dir():
+             # `git clone` refuses a non-empty destination, so a checkout left
+             # behind by a previous run is updated rather than cloned again.
+             subprocess.check_call(
+                 ["git", "pull", "--ff-only", self.INDEX_REPOSITORY_URL],
+                 cwd=destination,
+             )
+         else:
+             subprocess.check_call(
+                 ["git", "clone", self.INDEX_REPOSITORY_URL, str(destination)]
+             )
+
+     def get_crates_index(self) -> List[Path]:
+         """Build a sorted list of file paths excluding dotted directories and
+         dotted files.
+
+         Each file path corresponds to a crate that lists all available
+         versions. The top-level ``config.json`` file is left out as well.
+         """
+         root = Path(self.DESTINATION_PATH)
+         config_file = root / "config.json"
+
+         return sorted(
+             path
+             for path in root.rglob("*")
+             # Only the components below the clone root are inspected, so that
+             # a dotted directory in the location of the clone itself (e.g.
+             # ~/.cache/...) does not exclude every file.
+             if not any(part.startswith(".") for part in path.relative_to(root).parts)
+             and path.is_file()
+             and path != config_file
+         )
+
+     def get_pages(self) -> Iterator[CratesListerPage]:
+         """Yield one page per crate, crates being visited in sorted path order.
+
+         Each page is a list of crate versions with:
+             - name: Name of the crate
+             - version: Version
+             - checksum: Checksum
+             - crate_file: Url of the crate file
+             - last_update: Date of the last commit of the corresponding index
+               file
+         """
+         # Fetch crates.io index repository
+         self.get_index_repository()
+         # Get a list of all crates files from the index repository
+         crates_index = self.get_crates_index()
+         logger.debug("found %s crates in crates_index", len(crates_index))
+
+         root = Path(self.DESTINATION_PATH)
+
+         for crate in crates_index:
+             # %cI is for strict iso8601 date formatting.
+             # The path is given relative to the clone (git runs with cwd set
+             # to it) and after "--" so it can never be read as a revision.
+             last_update_str = subprocess.check_output(
+                 [
+                     "git",
+                     "log",
+                     "-1",
+                     "--pretty=format:%cI",
+                     "--",
+                     str(crate.relative_to(root)),
+                 ],
+                 cwd=root,
+             )
+             last_update = iso8601.parse_date(last_update_str.decode().strip())
+
+             page = []
+             with crate.open("rb") as current_file:
+                 for line in current_file:
+                     if not line.strip():
+                         # A blank line carries no version; json.loads would
+                         # raise on it.
+                         continue
+                     data = json.loads(line)
+                     # pick only the data we need
+                     page.append(
+                         dict(
+                             name=data["name"],
+                             version=data["vers"],
+                             checksum=data["cksum"],
+                             crate_file=self.CRATE_FILE_URL_PATTERN.format(
+                                 crate=data["name"], version=data["vers"]
+                             ),
+                             last_update=last_update,
+                         )
+                     )
+             yield page
+
+     def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]:
+         """Yield one ListedOrigin per crate version found in ``page``.
+
+         The origin url is the url of the crate file; name, version and
+         checksum are passed along as extra loader arguments.
+         """
+
+         assert self.lister_obj.id is not None
+
+         for version in page:
+             yield ListedOrigin(
+                 lister_id=self.lister_obj.id,
+                 visit_type=self.VISIT_TYPE,
+                 url=version["crate_file"],
+                 last_update=version["last_update"],
+                 extra_loader_arguments={
+                     "name": version["name"],
+                     "version": version["version"],
+                     "checksum": version["checksum"],
+                 },
+             )
diff --git a/swh/lister/crates/tasks.py b/swh/lister/crates/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.crates.lister import CratesLister
+
+
+ @shared_task(name=__name__ + ".CratesListerTask")
+ def list_crates(**lister_args):
+     """Lister task for crates (rust) registry
+
+     Builds a CratesLister from the configuration file with the given keyword
+     arguments, runs it and returns the resulting statistics as a dict.
+     """
+     lister = CratesLister.from_configfile(**lister_args)
+     stats = lister.run()
+     return stats.dict()
+
+
+ @shared_task(name=__name__ + ".ping")
+ def _ping():
+     """Trivial task that always answers with the string "OK"."""
+     return "OK"
diff --git a/swh/lister/crates/tests/__init__.py b/swh/lister/crates/tests/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/tests/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from pathlib import PosixPath
+import subprocess
+from typing import Optional, Union
+
+
+ def prepare_repository_from_archive(
+     archive_path: str,
+     filename: Optional[str] = None,
+     tmp_path: Union[PosixPath, str] = "/tmp",
+ ) -> str:
+     """Unpack an archive with ``tar`` and return a ``file://`` url for it.
+
+     A missing archive is not handled here: the archive is expected to exist.
+
+     Args:
+         archive_path: path of the archive handed to ``tar xf``
+         filename: last component of the returned url; when empty or None the
+             basename of ``archive_path`` is used instead
+         tmp_path: directory the archive is extracted into
+
+     Returns:
+         The url ``file://{tmp_path}/{filename}``, usable as an origin url.
+
+     """
+     extraction_dir = tmp_path if isinstance(tmp_path, str) else str(tmp_path)
+     # uncompress folder/repositories/dump for the loader to ingest
+     subprocess.check_output(["tar", "xf", archive_path, "-C", extraction_dir])
+     # build the origin url (or some derivative form)
+     leaf = filename or os.path.basename(archive_path)
+     return f"file://{extraction_dir}/{leaf}"
diff --git a/swh/lister/crates/tests/data/fake-crates-repository.tar.gz b/swh/lister/crates/tests/data/fake-crates-repository.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/crates/tests/data/fake_crates_repository_init.sh b/swh/lister/crates/tests/data/fake_crates_repository_init.sh
new file mode 100755
--- /dev/null
+++ b/swh/lister/crates/tests/data/fake_crates_repository_init.sh
@@ -0,0 +1,37 @@
+ #!/usr/bin/env bash
+
+ # Script used to build the content of fake-crates-repository.tar.gz.
+ # Creates a git repository laid out like
+ # https://github.com/rust-lang/crates.io-index, for testing purposes.
+ # NOTE(review): the script stops after the commit below; packing
+ # tmp_dir/crates.io-index into the .tar.gz looks like a separate manual
+ # step -- confirm.
+
+ # Abort on the first failing command, on unset variables and on pipe failures
+ set -euo pipefail
+
+ # files and directories
+ mkdir -p tmp_dir/crates.io-index/
+ cd tmp_dir/crates.io-index/
+
+ # A hidden directory holding one file, plus the two crate directories
+ mkdir -p .dot-dir
+ touch .dot-dir/empty
+ mkdir -p ra/nd
+ mkdir -p re/ge
+
+ # A hidden file and an empty config.json at the root of the repository
+ touch .dot-file
+ touch config.json
+
+ # Index files: one JSON document per line, one line per crate version.
+ # ">" creates the file with its first version, ">>" appends the next ones.
+ # rand: 2 versions
+ echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand
+ echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand
+
+ # regex: 4 versions
+ echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex
+ echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex
+ echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex
+ echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex
+
+ # regex-syntax: 1 version
+ echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax
+
+ # Init as a git repository
+ git init
+ git add .
+ git commit -m "Init fake crates.io-index repository for tests purpose"
+
+ # Save some space
+ rm .git/hooks/*.sample
diff --git a/swh/lister/crates/tests/test_lister.py b/swh/lister/crates/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/tests/test_lister.py
@@ -0,0 +1,89 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from pathlib import Path
+
+from swh.lister.crates.lister import CratesLister
+from swh.lister.crates.tests import prepare_repository_from_archive
+
+ # Origins the lister is expected to record for the fake index repository:
+ # one entry per crate version, holding the values test_crates_lister compares
+ # (origin url, and the name/version/checksum extra loader arguments).
+ expected_origins = [
+ {
+ "name": "rand",
+ "version": "0.1.1",
+ "checksum": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d",
+ "url": "https://static.crates.io/crates/rand/rand-0.1.1.crate",
+ },
+ {
+ "name": "rand",
+ "version": "0.1.2",
+ "checksum": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7",
+ "url": "https://static.crates.io/crates/rand/rand-0.1.2.crate",
+ },
+ {
+ "name": "regex",
+ "version": "0.1.0",
+ "checksum": "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5",
+ "url": "https://static.crates.io/crates/regex/regex-0.1.0.crate",
+ },
+ {
+ "name": "regex",
+ "version": "0.1.1",
+ "checksum": "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36",
+ "url": "https://static.crates.io/crates/regex/regex-0.1.1.crate",
+ },
+ {
+ "name": "regex",
+ "version": "0.1.2",
+ "checksum": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9",
+ "url": "https://static.crates.io/crates/regex/regex-0.1.2.crate",
+ },
+ {
+ "name": "regex",
+ "version": "0.1.3",
+ "checksum": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3",
+ "url": "https://static.crates.io/crates/regex/regex-0.1.3.crate",
+ },
+ {
+ "name": "regex-syntax",
+ "version": "0.1.0",
+ "checksum": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944",
+ "url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate",
+ },
+ ]
+
+
+ def test_crates_lister(datadir, tmp_path, swh_scheduler):
+     """Run the lister on the fake index repository and check what it records.
+
+     The fake repository archive is extracted into ``tmp_path`` and used as the
+     index to clone. The run must report 3 pages and 7 origins, and the origins
+     stored in the scheduler must match ``expected_origins``.
+     """
+     archive_path = Path(datadir, "fake-crates-repository.tar.gz")
+     repo_url = prepare_repository_from_archive(
+         archive_path, "crates.io-index", tmp_path
+     )
+
+     lister = CratesLister(scheduler=swh_scheduler)
+     lister.INDEX_REPOSITORY_URL = repo_url
+     # Clone inside this test's own directory: tmp_path.parent is shared by all
+     # the tests of a session, so a clone made there could collide with another.
+     lister.DESTINATION_PATH = tmp_path / "crates.io-index-tests"
+
+     res = lister.run()
+
+     assert res.pages == 3
+     assert res.origins == 7
+
+     expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url"))
+     scheduler_origins_sorted = sorted(
+         swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
+         key=lambda x: x.url,
+     )
+
+     # Compare the lengths first: zip() below stops at the shortest sequence
+     # and would not notice a missing or extra origin.
+     assert len(scheduler_origins_sorted) == len(expected_origins_sorted)
+
+     for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted):
+         assert scheduled.visit_type == "rust-crate"
+         assert scheduled.url == expected.get("url")
+         for key in ("name", "version", "checksum"):
+             assert scheduled.extra_loader_arguments.get(key) == expected.get(key)
diff --git a/swh/lister/crates/tests/test_tasks.py b/swh/lister/crates/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+ def test_crates_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ """Send the ping task by name and check that it succeeds with "OK"."""
+ res = swh_scheduler_celery_app.send_task("swh.lister.crates.tasks.ping")
+ assert res
+ # Block until the task has finished before inspecting its outcome
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+ def test_crates_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ """Send the lister task by name, with CratesLister replaced by a mock.
+
+ The task result must be the dict form of the stats returned by the mocked
+ ``run``, and ``from_configfile`` and ``run`` must each be called once,
+ without arguments.
+ """
+ # setup the mocked CratesLister
+ lister = mocker.patch("swh.lister.crates.tasks.CratesLister")
+ # from_configfile() hands back the mock itself, so run() is recorded on it
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=42, origins=42)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task("swh.lister.crates.tasks.CratesListerTask")
+ assert res
+ # Block until the task has finished before inspecting its outcome
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Mon, Apr 14, 7:32 AM (10 h, 50 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216119

Event Timeline