Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7437785
D7367.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
16 KB
Subscribers
None
D7367.diff
View Options
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,6 +17,7 @@
- id: codespell
name: Check source code spelling
exclude: ^(swh/lister/.*/tests/data/.*)$
+ args: [-L crate]
stages: [commit]
- id: codespell
name: Check commit message spelling
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,7 @@
lister.bitbucket=swh.lister.bitbucket:register
lister.cgit=swh.lister.cgit:register
lister.cran=swh.lister.cran:register
+ lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
lister.gitea=swh.lister.gitea:register
lister.github=swh.lister.github:register
diff --git a/swh/lister/crates/__init__.py b/swh/lister/crates/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2022 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .lister import CratesLister
+
+ return {
+ "lister": CratesLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/lister.py
@@ -0,0 +1,138 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import logging
+from pathlib import Path
+import subprocess
+from typing import Any, Dict, Iterator, List
+
+import iso8601
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+CratesListerPage = List[Dict[str, Any]]
+
+
+class CratesLister(StatelessLister[CratesListerPage]):
+ """List origins from the "crates.io" forge.
+
+ It basically fetches https://github.com/rust-lang/crates.io-index.git to a
+ temp directory and then walks through each file to get the crate's info.
+ """
+
+ # Part of the lister API, that identifies this lister
+ LISTER_NAME = "crates"
+ # (Optional) VCS type of the origins listed by this lister, if constant
+ VISIT_TYPE = "rust-crate"
+
+ INSTANCE = "crates"
+ INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git"
+ DESTINATION_PATH = Path("/tmp/crates.io-index")
+ CRATE_FILE_URL_PATTERN = (
+ "https://static.crates.io/crates/{crate}/{crate}-{version}.crate"
+ )
+
+ def __init__(
+ self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ credentials=credentials,
+ url=self.INDEX_REPOSITORY_URL,
+ instance=self.INSTANCE,
+ )
+
+ def get_index_repository(self) -> None:
+ """Clone the crates.io-index repository to DESTINATION_PATH using git."""
+
+ subprocess.check_call(
+ ["git", "clone", self.INDEX_REPOSITORY_URL, self.DESTINATION_PATH,]
+ )
+
+ def get_crates_index(self) -> List[Path]:
+ """Build a sorted list of file paths excluding dotted directories and
+ dotted files.
+
+ Each file corresponds to one crate and lists all of its available
+ versions; the top-level config.json file is excluded as well.
+ """
+
+ crates_index = sorted(
+ path
+ for path in self.DESTINATION_PATH.rglob("*")
+ if not any(part.startswith(".") for part in path.parts)
+ and path.is_file()
+ and path != self.DESTINATION_PATH / "config.json"
+ )
+
+ return crates_index
+
+ def get_pages(self) -> Iterator[CratesListerPage]:
+ """Yield one page per crate index file, in ascending order of file path.
+
+ Each page is a list of crate versions with:
+ - name: Name of the crate
+ - version: Version
+ - checksum: Checksum
+ - crate_file: Url of the crate file
+ - last_update: Date of the last commit of the corresponding index
+ file
+ """
+ # Fetch crates.io index repository
+ self.get_index_repository()
+ # Get a list of all crates files from the index repository
+ crates_index = self.get_crates_index()
+ logger.debug("found %s crates in crates_index", len(crates_index))
+
+ for crate in crates_index:
+ page = []
+ # %cI is for strict iso8601 date formatting
+ last_update_str = subprocess.check_output(
+ ["git", "log", "-1", "--pretty=format:%cI", str(crate)],
+ cwd=self.DESTINATION_PATH,
+ )
+ last_update = iso8601.parse_date(last_update_str.decode().strip())
+
+ with crate.open("rb") as current_file:
+ for line in current_file:
+ data = json.loads(line)
+ # pick only the data we need
+ page.append(
+ dict(
+ name=data["name"],
+ version=data["vers"],
+ checksum=data["cksum"],
+ crate_file=self.CRATE_FILE_URL_PATTERN.format(
+ crate=data["name"], version=data["vers"]
+ ),
+ last_update=last_update,
+ )
+ )
+ yield page
+
+ def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]:
+ """Yield a ListedOrigin instance for each crate version of the page."""
+
+ assert self.lister_obj.id is not None
+
+ for version in page:
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ visit_type=self.VISIT_TYPE,
+ url=version["crate_file"],
+ last_update=version["last_update"],
+ extra_loader_arguments={
+ "name": version["name"],
+ "version": version["version"],
+ "checksum": version["checksum"],
+ },
+ )
diff --git a/swh/lister/crates/tasks.py b/swh/lister/crates/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.crates.lister import CratesLister
+
+
+@shared_task(name=__name__ + ".CratesListerTask")
+def list_crates(**lister_args):
+ """Lister task for crates (rust) registry"""
+ return CratesLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ return "OK"
diff --git a/swh/lister/crates/tests/__init__.py b/swh/lister/crates/tests/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/tests/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2022 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from pathlib import PosixPath
+import subprocess
+from typing import Optional, Union
+
+
+def prepare_repository_from_archive(
+ archive_path: str,
+ filename: Optional[str] = None,
+ tmp_path: Union[PosixPath, str] = "/tmp",
+) -> str:
+ """Given an existing archive_path, uncompress it.
+ Returns a file repo url which can be used as origin url.
+
+ This does not deal with the case where the archive passed along does not exist.
+
+ """
+ if not isinstance(tmp_path, str):
+ tmp_path = str(tmp_path)
+ # uncompress folder/repositories/dump for the lister to ingest
+ subprocess.check_output(["tar", "xf", archive_path, "-C", tmp_path])
+ # build the origin url (or some derivative form)
+ _fname = filename if filename else os.path.basename(archive_path)
+ repo_url = f"file://{tmp_path}/{_fname}"
+ return repo_url
diff --git a/swh/lister/crates/tests/data/fake-crates-repository.tar.gz b/swh/lister/crates/tests/data/fake-crates-repository.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/crates/tests/data/fake_crates_repository_init.sh b/swh/lister/crates/tests/data/fake_crates_repository_init.sh
new file mode 100755
--- /dev/null
+++ b/swh/lister/crates/tests/data/fake_crates_repository_init.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# Script to generate fake-crates-repository.tar.gz
+# Creates a git repository like https://github.com/rust-lang/crates.io-index
+# for testing purposes
+
+set -euo pipefail
+
+# files and directories
+mkdir -p tmp_dir/crates.io-index/
+cd tmp_dir/crates.io-index/
+
+mkdir -p .dot-dir
+touch .dot-dir/empty
+mkdir -p ra/nd
+mkdir -p re/ge
+
+touch .dot-file
+touch config.json
+
+echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand
+echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand
+
+echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex
+echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex
+echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex
+echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex
+
+echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax
+
+# Init as a git repository
+git init
+git add .
+git commit -m "Init fake crates.io-index repository for tests purpose"
+
+# Save some space
+rm .git/hooks/*.sample
diff --git a/swh/lister/crates/tests/test_lister.py b/swh/lister/crates/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/tests/test_lister.py
@@ -0,0 +1,89 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from pathlib import Path
+
+from swh.lister.crates.lister import CratesLister
+from swh.lister.crates.tests import prepare_repository_from_archive
+
+expected_origins = [
+ {
+ "name": "rand",
+ "version": "0.1.1",
+ "checksum": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d",
+ "url": "https://static.crates.io/crates/rand/rand-0.1.1.crate",
+ },
+ {
+ "name": "rand",
+ "version": "0.1.2",
+ "checksum": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7",
+ "url": "https://static.crates.io/crates/rand/rand-0.1.2.crate",
+ },
+ {
+ "name": "regex",
+ "version": "0.1.0",
+ "checksum": "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5",
+ "url": "https://static.crates.io/crates/regex/regex-0.1.0.crate",
+ },
+ {
+ "name": "regex",
+ "version": "0.1.1",
+ "checksum": "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36",
+ "url": "https://static.crates.io/crates/regex/regex-0.1.1.crate",
+ },
+ {
+ "name": "regex",
+ "version": "0.1.2",
+ "checksum": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9",
+ "url": "https://static.crates.io/crates/regex/regex-0.1.2.crate",
+ },
+ {
+ "name": "regex",
+ "version": "0.1.3",
+ "checksum": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3",
+ "url": "https://static.crates.io/crates/regex/regex-0.1.3.crate",
+ },
+ {
+ "name": "regex-syntax",
+ "version": "0.1.0",
+ "checksum": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944",
+ "url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate",
+ },
+]
+
+
+def test_crates_lister(datadir, tmp_path, swh_scheduler):
+ archive_path = Path(datadir, "fake-crates-repository.tar.gz")
+ repo_url = prepare_repository_from_archive(
+ archive_path, "crates.io-index", tmp_path
+ )
+
+ lister = CratesLister(scheduler=swh_scheduler)
+ lister.INDEX_REPOSITORY_URL = repo_url
+ lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests"
+
+ res = lister.run()
+
+ assert res.pages == 3
+ assert res.origins == 7
+
+ expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url"))
+ scheduler_origins_sorted = sorted(
+ swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
+ key=lambda x: x.url,
+ )
+
+ for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted):
+ assert scheduled.visit_type == "rust-crate"
+ assert scheduled.url == expected.get("url")
+ assert scheduled.extra_loader_arguments.get("name") == expected.get("name")
+ assert scheduled.extra_loader_arguments.get("version") == expected.get(
+ "version"
+ )
+ assert scheduled.extra_loader_arguments.get("checksum") == expected.get(
+ "checksum"
+ )
+
+ assert len(scheduler_origins_sorted) == len(expected_origins_sorted)
diff --git a/swh/lister/crates/tests/test_tasks.py b/swh/lister/crates/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/crates/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_crates_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ res = swh_scheduler_celery_app.send_task("swh.lister.crates.tasks.ping")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+def test_crates_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ # setup the mocked CratesLister
+ lister = mocker.patch("swh.lister.crates.tasks.CratesLister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=42, origins=42)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task("swh.lister.crates.tasks.CratesListerTask")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Apr 14, 7:32 AM (10 h, 50 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216119
Attached To
D7367: Start rust crates lister
Event Timeline
Log In to Comment