Changeset View
Standalone View
swh/lister/crates/tests/test_lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||||||
from pathlib import Path | from pathlib import Path | ||||||||||||
from swh.lister.crates.lister import CratesLister | from dulwich.repo import Repo | ||||||||||||
from swh.lister.crates.lister import CratesLister, CratesListerState | |||||||||||||
from swh.lister.crates.tests import prepare_repository_from_archive | from swh.lister.crates.tests import prepare_repository_from_archive | ||||||||||||
expected_origins = [ | expected_origins = [ | ||||||||||||
{ | { | ||||||||||||
"url": "https://crates.io/api/v1/crates/rand", | "url": "https://crates.io/api/v1/crates/rand", | ||||||||||||
"artifacts": [ | "artifacts": [ | ||||||||||||
{ | { | ||||||||||||
"checksums": { | "checksums": { | ||||||||||||
▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines | expected_origins = [ | ||||||||||||
"filename": "regex-syntax-0.1.0.crate", | "filename": "regex-syntax-0.1.0.crate", | ||||||||||||
"url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate", | "url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate", | ||||||||||||
"version": "0.1.0", | "version": "0.1.0", | ||||||||||||
}, | }, | ||||||||||||
], | ], | ||||||||||||
}, | }, | ||||||||||||
] | ] | ||||||||||||
expected_origins_incremental = [ | |||||||||||||
{ | |||||||||||||
"url": "https://crates.io/api/v1/crates/regex", | |||||||||||||
"artifacts": [ | |||||||||||||
{ | |||||||||||||
"checksums": { | |||||||||||||
"sha256": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3", # noqa: B950 | |||||||||||||
}, | |||||||||||||
"filename": "regex-0.1.3.crate", | |||||||||||||
"url": "https://static.crates.io/crates/regex/regex-0.1.3.crate", | |||||||||||||
"version": "0.1.3", | |||||||||||||
}, | |||||||||||||
{ | |||||||||||||
"checksums": { | |||||||||||||
"sha256": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9", # noqa: B950 | |||||||||||||
}, | |||||||||||||
"filename": "regex-0.1.2.crate", | |||||||||||||
"url": "https://static.crates.io/crates/regex/regex-0.1.2.crate", | |||||||||||||
"version": "0.1.2", | |||||||||||||
vlorentz: Existing versions (`0.1.0` and `0.1.1`) are missing from the artifacts list. Wouldn't this… | |||||||||||||
Done Inline ActionsIt's test data for incremental mode, here I suppose 0.1.0 and 0.1.1 has already been loaded by a previous lister run. When the lister runs in incremental mode, what it get is based on the diff of the commits since the last run. Do I miss something? franckbret: It's test data for incremental mode, here I suppose 0.1.0 and 0.1.1 has already been loaded by… | |||||||||||||
Done Inline ActionsAt the end of each run, loaders create a Snapshot object, with a branch for each of the current versions of the package; so it needs to be aware of all versions, even if they were already loaded. Existing versions should only be missing from subsequent snapshots if they are removed from the origin. vlorentz: At the end of each run, loaders create a Snapshot object, with a branch for each of the current… | |||||||||||||
Done Inline ActionsOk, didn't understand it this way at first! So in incremental mode I can read each new or changed files since last_commit and return all versions found in the file. Each crate version entry has a "yanked" value. Is it ok for you if I return only version with yanked=false? franckbret: Ok, didn't understand it this way at first! So in incremental mode I can read each new or… | |||||||||||||
Not Done Inline ActionsHard question! We don't have a way to represent "soft deletions" like this in SWH. I err on the side of including it anyway; otherwise we would miss packages that were published and quickly yanked. It would be nice to have metadata somewhere (maybe extrinsic metadata on the snapshot object? or just rely on the git index being archived as a regular git repo) to keep a record of the yank status, but it should not affect the main archive. vlorentz: Hard question! We don't have a way to represent "soft deletions" like this in SWH.
I err on… | |||||||||||||
}, | |||||||||||||
], | |||||||||||||
}, | |||||||||||||
{ | |||||||||||||
"url": "https://crates.io/api/v1/crates/regex-syntax", | |||||||||||||
"artifacts": [ | |||||||||||||
{ | |||||||||||||
"checksums": { | |||||||||||||
"sha256": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944", # noqa: B950 | |||||||||||||
}, | |||||||||||||
"filename": "regex-syntax-0.1.0.crate", | |||||||||||||
"url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate", | |||||||||||||
"version": "0.1.0", | |||||||||||||
}, | |||||||||||||
], | |||||||||||||
}, | |||||||||||||
] | |||||||||||||
def test_crates_lister(datadir, tmp_path, swh_scheduler): | def test_crates_lister(datadir, tmp_path, swh_scheduler): | ||||||||||||
archive_path = Path(datadir, "fake-crates-repository.tar.gz") | archive_path = Path(datadir, "fake-crates-repository.tar.gz") | ||||||||||||
repo_url = prepare_repository_from_archive( | repo_url = prepare_repository_from_archive( | ||||||||||||
archive_path, "crates.io-index", tmp_path | archive_path, "crates.io-index", tmp_path | ||||||||||||
) | ) | ||||||||||||
lister = CratesLister(scheduler=swh_scheduler) | lister = CratesLister(scheduler=swh_scheduler) | ||||||||||||
Show All 14 Lines | def test_crates_lister(datadir, tmp_path, swh_scheduler): | ||||||||||||
for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted): | for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted): | ||||||||||||
assert scheduled.visit_type == "crates" | assert scheduled.visit_type == "crates" | ||||||||||||
assert scheduled.url == expected.get("url") | assert scheduled.url == expected.get("url") | ||||||||||||
assert scheduled.extra_loader_arguments.get("artifacts") == expected.get( | assert scheduled.extra_loader_arguments.get("artifacts") == expected.get( | ||||||||||||
"artifacts" | "artifacts" | ||||||||||||
) | ) | ||||||||||||
assert len(scheduler_origins_sorted) == len(expected_origins_sorted) | assert len(scheduler_origins_sorted) == len(expected_origins_sorted) | ||||||||||||
def test_crates_lister_incremental(datadir, tmp_path, swh_scheduler): | |||||||||||||
archive_path = Path(datadir, "fake-crates-repository.tar.gz") | |||||||||||||
repo_url = prepare_repository_from_archive( | |||||||||||||
archive_path, "crates.io-index", tmp_path | |||||||||||||
) | |||||||||||||
lister = CratesLister(scheduler=swh_scheduler) | |||||||||||||
lister.INDEX_REPOSITORY_URL = repo_url | |||||||||||||
lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" | |||||||||||||
# Set a CratesListerState with a last commit value to force incremental case | |||||||||||||
last_commit_state = CratesListerState( | |||||||||||||
Done Inline Actionshardcoding this commit id in the test means it needs to be recomputing every time swh/lister/crates/tests/data/fake_crates_repository_init.sh runs. You need to either make swh/lister/crates/tests/data/fake_crates_repository_init.sh fully deterministic (enforce dates and author info), or compute this id dynamically. vlorentz: hardcoding this commit id in the test means it needs to be recomputing every time… | |||||||||||||
Done Inline Actions
I have used a different strategy in tests using dulwich to compute the commit id dynamically franckbret: > hardcoding this commit id in the test means it needs to be recomputing every time… | |||||||||||||
last_commit="e323097462f41d6ce969a0b7d2e11cdf354be407" | |||||||||||||
) | |||||||||||||
lister.state = last_commit_state | |||||||||||||
Done Inline Actions
using iteration variables outside the loop is a little unsettling vlorentz: using iteration variables outside the loop is a little unsettling | |||||||||||||
res = lister.run() | |||||||||||||
assert res.pages == 2 | |||||||||||||
assert res.origins == 2 | |||||||||||||
expected_origins_sorted = sorted( | |||||||||||||
expected_origins_incremental, key=lambda x: x.get("url") | |||||||||||||
) | |||||||||||||
scheduler_origins_sorted = sorted( | |||||||||||||
swh_scheduler.get_listed_origins(lister.lister_obj.id).results, | |||||||||||||
key=lambda x: x.url, | |||||||||||||
) | |||||||||||||
for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted): | |||||||||||||
assert scheduled.visit_type == "crates" | |||||||||||||
assert scheduled.url == expected.get("url") | |||||||||||||
assert scheduled.extra_loader_arguments.get("artifacts") == expected.get( | |||||||||||||
"artifacts" | |||||||||||||
) | |||||||||||||
assert len(scheduler_origins_sorted) == len(expected_origins_sorted) | |||||||||||||
def test_crates_lister_incremental_nothing_new(datadir, tmp_path, swh_scheduler): | |||||||||||||
"""Ensure incremental mode runs fine when the repository last commit is the same | |||||||||||||
than lister.state.las-_commit""" | |||||||||||||
archive_path = Path(datadir, "fake-crates-repository.tar.gz") | |||||||||||||
repo_url = prepare_repository_from_archive( | |||||||||||||
archive_path, "crates.io-index", tmp_path | |||||||||||||
) | |||||||||||||
lister = CratesLister(scheduler=swh_scheduler) | |||||||||||||
lister.INDEX_REPOSITORY_URL = repo_url | |||||||||||||
lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" | |||||||||||||
lister.get_index_repository() | |||||||||||||
repo = Repo(lister.DESTINATION_PATH) | |||||||||||||
# Set a CratesListerState with a last commit value to force incremental case | |||||||||||||
last_commit_state = CratesListerState(last_commit=repo.head().decode()) | |||||||||||||
lister.state = last_commit_state | |||||||||||||
res = lister.run() | |||||||||||||
assert res.pages == 0 | |||||||||||||
assert res.origins == 0 | |||||||||||||
Done Inline Actions
first one doesn't really test anything vlorentz: first one doesn't really test anything |
Existing versions (0.1.0 and 0.1.1) are missing from the artifacts list. Wouldn't this cause them to be missing from the Snapshot object created by the loader?