diff --git a/swh/loader/bzr/loader.py b/swh/loader/bzr/loader.py index 61ba7e2..bb06189 100644 --- a/swh/loader/bzr/loader.py +++ b/swh/loader/bzr/loader.py @@ -1,688 +1,705 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """This document contains a SWH loader for ingesting repository data from Bazaar or Breezy. """ from datetime import datetime from functools import lru_cache, partial import itertools import os from tempfile import mkdtemp from typing import Dict, Iterator, List, NewType, Optional, Set, Tuple, TypeVar, Union from breezy import errors as bzr_errors from breezy import repository, tsort from breezy.builtins import cmd_branch, cmd_upgrade from breezy.bzr import bzrdir from breezy.bzr.branch import Branch as BzrBranch from breezy.bzr.inventory import Inventory, InventoryEntry from breezy.bzr.inventorytree import InventoryTreeChange from breezy.revision import NULL_REVISION from breezy.revision import Revision as BzrRevision from breezy.tree import Tree from swh.loader.core.loader import BaseLoader from swh.loader.core.utils import clean_dangling_folders, clone_with_timeout from swh.model import from_disk, swhids from swh.model.model import ( Content, ExtID, ObjectType, Origin, Person, Release, Revision, RevisionType, Sha1Git, Snapshot, SnapshotBranch, TargetType, Timestamp, TimestampWithTimezone, ) from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import StorageInterface TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.bzr.from_disk" EXTID_TYPE = "bzr-nodeid" EXTID_VERSION: int = 1 BzrRevisionId = NewType("BzrRevisionId", bytes) T = TypeVar("T") # These are all the old Bazaar repository formats that we might encounter # in the wild. Bazaar's `clone` does not result in an upgrade, it needs to be # explicit. older_repository_formats = { b"Bazaar Knit Repository Format 3 (bzr 0.15)\n", b"Bazaar Knit Repository Format 4 (bzr 1.0)\n", b"Bazaar RepositoryFormatKnitPack5 (bzr 1.6)\n", b"Bazaar RepositoryFormatKnitPack5RichRoot (bzr 1.6)\n", b"Bazaar RepositoryFormatKnitPack5RichRoot (bzr 1.6.1)\n", b"Bazaar RepositoryFormatKnitPack6 (bzr 1.9)\n", b"Bazaar RepositoryFormatKnitPack6RichRoot (bzr 1.9)\n", b"Bazaar development format 2 with subtree support \ (needs bzr.dev from before 1.8)\n", b"Bazaar development format 8\n", b"Bazaar pack repository format 1 (needs bzr 0.92)\n", b"Bazaar pack repository format 1 with rich root (needs bzr 1.0)\n", b"Bazaar pack repository format 1 with subtree support (needs bzr 0.92)\n", b"Bazaar-NG Knit Repository Format 1", } # Latest one as of this time, unlikely to change expected_repository_format = b"Bazaar repository format 2a (needs bzr 1.16 or later)\n" class UnknownRepositoryFormat(Exception): """The repository we're trying to load is using an unknown format. It's possible (though unlikely) that a new format has come out, we should check before dismissing the repository as broken or unsupported.""" class BzrDirectory(from_disk.Directory): """A more practical directory. - creates missing parent directories - removes empty directories """ def __setitem__( self, path: bytes, value: Union[from_disk.Content, "BzrDirectory"] ) -> None: if b"/" in path: head, tail = path.split(b"/", 1) directory = self.get(head) if directory is None or isinstance(directory, from_disk.Content): directory = BzrDirectory() self[head] = directory directory[tail] = value else: super().__setitem__(path, value) def __delitem__(self, path: bytes) -> None: super().__delitem__(path) while b"/" in path: # remove empty parent directories path = path.rsplit(b"/", 1)[0] if len(self[path]) == 0: super().__delitem__(path) else: break def get( self, path: bytes, default: Optional[T] = None ) -> Optional[Union[from_disk.Content, "BzrDirectory", T]]: # TODO move to swh.model.from_disk.Directory try: return self[path] except KeyError: return default def sort_changes(change: InventoryTreeChange) -> str: """Key function for sorting the changes by path. Sorting allows us to group the folders together (for example "b", then "a/a", then "a/b"). Reversing this sort in the `sorted()` call will make it so the files appear before the folder ("a/a", then "a") if the folder has changed. This removes a bug where the order of operations is: - "a" goes from directory to file, removing all of its subtree - "a/a" is removed, but our structure has already forgotten it""" source_path, target_path = change.path # Neither path can be the empty string return source_path or target_path class BazaarLoader(BaseLoader): """Loads a Bazaar repository""" visit_type = "bzr" def __init__( self, storage: StorageInterface, url: str, directory: Optional[str] = None, logging_class: str = "swh.loader.bzr.Loader", visit_date: Optional[datetime] = None, temp_directory: str = "/tmp", clone_timeout_seconds: int = 7200, max_content_size: Optional[int] = None, ): super().__init__( storage=storage, logging_class=logging_class, max_content_size=max_content_size, ) self._temp_directory = temp_directory self._clone_timeout = clone_timeout_seconds self._revision_id_to_sha1git: Dict[BzrRevisionId, Sha1Git] = {} self._last_root = BzrDirectory() self._tags: Optional[Dict[bytes, BzrRevisionId]] = None self._head_revision_id: Optional[bytes] = None # Remember the previous revision to only compute the delta between # revisions self._prev_revision: Optional[BzrRevision] = None self._branch: Optional[BzrBranch] = None # Revisions that are pointed to, but don't exist in the current branch # Rare, but exist usually for cross-VCS references. self._ghosts: Set[BzrRevisionId] = set() # Exists if in an incremental run, is the latest saved revision from # this origin self._latest_head: Optional[BzrRevisionId] = None self._load_status = "eventful" self.origin_url = url self.visit_date = visit_date self.directory = directory self.repo: Optional[repository.Repository] = None def pre_cleanup(self) -> None: """As a first step, will try and check for dangling data to cleanup. This should do its best to avoid raising issues. """ clean_dangling_folders( self._temp_directory, pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, log=self.log, ) def prepare_origin_visit(self) -> None: """First step executed by the loader to prepare origin and visit references. Set/update self.origin, and optionally self.origin_url, self.visit_date. """ self.origin = Origin(url=self.origin_url) def prepare(self) -> None: """Second step executed by the loader to prepare some state needed by the loader. """ latest_snapshot = snapshot_get_latest(self.storage, self.origin_url) if latest_snapshot: self._set_recorded_state(latest_snapshot) def load_status(self) -> Dict[str, str]: """Detailed loading status. Defaults to logging an eventful load. Returns: a dictionary that is eventually passed back as the task's result to the scheduler, allowing tuning of the task recurrence mechanism. """ return { "status": self._load_status, } def _set_recorded_state(self, latest_snapshot: Snapshot) -> None: if not latest_snapshot.branches: # Last snapshot was empty return head = latest_snapshot.branches[b"trunk"] bzr_head = self._get_extids_for_targets([head.target])[0].extid self._latest_head = BzrRevisionId(bzr_head) def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]: """Get all Bzr ExtIDs for the targets in the latest snapshot""" extids = [] for extid in self.storage.extid_get_from_target( swhids.ObjectType.REVISION, targets, extid_type=EXTID_TYPE, extid_version=EXTID_VERSION, ): extids.append(extid) self._revision_id_to_sha1git[ BzrRevisionId(extid.extid) ] = extid.target.object_id if extids: # Filter out dangling extids, we need to load their target again revisions_missing = self.storage.revision_missing( [extid.target.object_id for extid in extids] ) extids = [ extid for extid in extids if extid.target.object_id not in revisions_missing ] return extids def cleanup(self) -> None: if self.repo is not None: self.repo.unlock() - def get_repo(self): - _, _, repo, _ = bzrdir.BzrDir.open_containing_tree_branch_or_repository( + def get_repo_and_branch(self) -> Tuple[repository.Repository, BzrBranch]: + _, branch, repo, _ = bzrdir.BzrDir.open_containing_tree_branch_or_repository( self._repo_directory ) - return repo + return repo, branch + + def run_upgrade(self): + """Upgrade both repository and branch to the most recent supported version + to be compatible with the loader.""" + cmd_upgrade().run(self._repo_directory, clean=True) def fetch_data(self) -> bool: """Fetch the data from the source the loader is currently loading Returns: a value that is interpreted as a boolean. If True, fetch_data needs to be called again to complete loading. """ if not self.directory: # no local repository self._repo_directory = mkdtemp( prefix=TEMPORARY_DIR_PREFIX_PATTERN, suffix=f"-{os.getpid()}", dir=self._temp_directory, ) msg = "Cloning '%s' to '%s' with timeout %s seconds" self.log.debug( msg, self.origin_url, self._repo_directory, self._clone_timeout ) closure = partial( cmd_branch().run, self.origin_url, self._repo_directory, no_tree=True, use_existing_dir=True, ) clone_with_timeout( self.origin_url, self._repo_directory, closure, self._clone_timeout ) else: # existing local repository # Allow to load on disk repository without cloning # for testing purpose. self.log.debug("Using local directory '%s'", self.directory) self._repo_directory = self.directory - repo = self.get_repo() + repo, branch = self.get_repo_and_branch() repository_format = repo._format.as_string() # lies about being a string + if not repository_format == expected_repository_format: if repository_format in older_repository_formats: self.log.debug( "Upgrading repository from format '%s'", repository_format.decode("ascii").strip("\n"), ) - cmd_upgrade().run(self._repo_directory, clean=True) - repo = self.get_repo() + self.run_upgrade() + repo, branch = self.get_repo_and_branch() else: raise UnknownRepositoryFormat() + if not branch.supports_tags(): + # Some repos have the right format marker but their branches do not + # support tags + self.log.debug("Branch does not support tags, upgrading") + self.run_upgrade() + repo, branch = self.get_repo_and_branch() + # We could set the branch here directly, but we want to run the + # sanity checks in the `self.branch` property, so let's make sure + # we invalidate the "cache". + self._branch = None + self.repo = repo self.repo.lock_read() self.head_revision_id # set the property self.tags # set the property return False def store_data(self) -> None: """Store fetched data in the database.""" assert self.repo is not None assert self.tags is not None # Insert revisions using a topological sorting revs = self._get_bzr_revs_to_load() if revs and revs[0] == NULL_REVISION: # The first rev we load isn't necessarily `NULL_REVISION` even in a # full load, as bzr allows for ghost revisions. revs = revs[1:] length_ingested_revs = 0 for rev in revs: self.store_revision(self.repo.get_revision(rev)) length_ingested_revs += 1 if length_ingested_revs == 0: # no new revision ingested, so uneventful # still we'll make a snapshot, so we continue self._load_status = "uneventful" snapshot_branches: Dict[bytes, Optional[SnapshotBranch]] = {} for tag_name, target in self.tags.items(): label = b"tags/%s" % tag_name if target == NULL_REVISION: # Some very rare repositories have meaningless tags that point # to the null revision. self.log.debug("Tag '%s' points to the null revision", tag_name) snapshot_branches[label] = None continue try: # Used only to detect corruption self.branch.revision_id_to_dotted_revno(target) except ( bzr_errors.NoSuchRevision, bzr_errors.GhostRevisionsHaveNoRevno, bzr_errors.UnsupportedOperation, ): # Bad tag data/merges can lead to tagged revisions # which are not in this branch. We cannot point a tag there. snapshot_branches[label] = None continue snp_target = self._get_revision_id_from_bzr_id(target) snapshot_branches[label] = SnapshotBranch( target=self.store_release(tag_name, snp_target), target_type=TargetType.RELEASE, ) if self.head_revision_id != NULL_REVISION: head_revision_git_hash = self._get_revision_id_from_bzr_id( self.head_revision_id ) snapshot_branches[b"trunk"] = SnapshotBranch( target=head_revision_git_hash, target_type=TargetType.REVISION ) snapshot_branches[b"HEAD"] = SnapshotBranch( target=b"trunk", target_type=TargetType.ALIAS, ) snapshot = Snapshot(branches=snapshot_branches) self.storage.snapshot_add([snapshot]) self.flush() self.loaded_snapshot_id = snapshot.id def store_revision(self, bzr_rev: BzrRevision) -> None: self.log.debug("Storing revision '%s'", bzr_rev.revision_id) directory = self.store_directories(bzr_rev) associated_bugs = [ (b"bug", b"%s %s" % (status.encode(), url.encode())) for url, status in bzr_rev.iter_bugs() ] extra_headers = [ (b"time_offset_seconds", str(bzr_rev.timezone).encode(),), *associated_bugs, ] timestamp = Timestamp(int(bzr_rev.timestamp), 0) timezone = round(int(bzr_rev.timezone) / 60) date = TimestampWithTimezone.from_numeric_offset(timestamp, timezone, False) # TODO (how) should we store multiple authors? (T3887) revision = Revision( author=Person.from_fullname(bzr_rev.get_apparent_authors()[0].encode()), date=date, committer=Person.from_fullname(bzr_rev.committer.encode()), committer_date=date, type=RevisionType.BAZAAR, directory=directory, message=bzr_rev.message.encode(), extra_headers=extra_headers, synthetic=False, parents=self._get_revision_parents(bzr_rev), ) self._revision_id_to_sha1git[bzr_rev.revision_id] = revision.id self.storage.revision_add([revision]) self.storage.extid_add( [ ExtID( extid_type=EXTID_TYPE, extid_version=EXTID_VERSION, extid=bzr_rev.revision_id, target=revision.swhid(), ) ] ) def store_directories(self, bzr_rev: BzrRevision) -> Sha1Git: """Store a revision's directories.""" repo: repository.Repository = self.repo inventory: Inventory = repo.get_inventory(bzr_rev.revision_id) if self._prev_revision is None: self._store_directories_slow(bzr_rev, inventory) return self._store_tree(bzr_rev) old_tree = self._get_revision_tree(self._prev_revision.revision_id) new_tree = self._get_revision_tree(bzr_rev.revision_id) delta = new_tree.changes_from(old_tree) if delta.renamed or delta.copied: # Figuring out all nested and possibly conflicting renames is a lot # of effort for very few revisions, just go the slow way self._store_directories_slow(bzr_rev, inventory) return self._store_tree(bzr_rev) to_remove = sorted( delta.removed + delta.missing, key=sort_changes, reverse=True ) for change in to_remove: if change.kind[0] == "directory": # empty directories will delete themselves in `self._last_root` continue path = change.path[0] del self._last_root[path.encode()] # `delta.kind_changed` needs to happen before `delta.added` since a file # could be added under a node that changed from directory to file at the # same time, for example for change in itertools.chain(delta.kind_changed, delta.added, delta.modified): path = change.path[1] entry = inventory.get_entry(change.file_id) content = self.store_content(bzr_rev, path, entry) self._last_root[path.encode()] = content self._prev_revision = bzr_rev return self._store_tree(bzr_rev) def store_release(self, name: bytes, target: Sha1Git) -> Sha1Git: """Store a release given its name and its target. Args: name: name of the release. target: sha1_git of the target revision. Returns: the sha1_git of the stored release. """ release = Release( name=name, target=target, target_type=ObjectType.REVISION, message=None, metadata=None, synthetic=False, author=Person(name=None, email=None, fullname=b""), date=None, ) self.storage.release_add([release]) return release.id def store_content( self, bzr_rev: BzrRevision, file_path: str, entry: InventoryEntry ) -> from_disk.Content: if entry.executable: perms = from_disk.DentryPerms.executable_content elif entry.kind == "directory": perms = from_disk.DentryPerms.directory elif entry.kind == "symlink": perms = from_disk.DentryPerms.symlink elif entry.kind == "file": perms = from_disk.DentryPerms.content else: # pragma: no cover raise RuntimeError("Hit unreachable condition") data = b"" if entry.has_text(): rev_tree = self._get_revision_tree(bzr_rev.revision_id) data = rev_tree.get_file(file_path).read() assert len(data) == entry.text_size content = Content.from_data(data) self.storage.content_add([content]) return from_disk.Content({"sha1_git": content.sha1_git, "perms": perms}) def _get_bzr_revs_to_load(self) -> List[BzrRevision]: assert self.repo is not None repo: repository.Repository = self.repo self.log.debug("Getting fully sorted revision tree") if self.head_revision_id == NULL_REVISION: return [] head_revision = repo.get_revision(self.head_revision_id) # bazaar's model doesn't allow it to iterate on its graph from # the bottom lazily, but basically all DAGs (especially bzr ones) # are small enough to fit in RAM. ancestors_iter = self._iterate_ancestors(head_revision) ancestry = [] for rev, parents in ancestors_iter: if parents is None: # Filter out ghosts, they scare the `TopoSorter`. # Store them to later catch exceptions about missing parent revision self._ghosts.add(rev) continue ancestry.append((rev, parents)) sorter = tsort.TopoSorter(ancestry) all_revisions = sorter.sorted() if self._latest_head is not None: # Breezy does not offer a generic querying system, so we do the # filtering ourselves, which is simple enough given that bzr does # not have multiple heads per branch found = False new_revisions = [] # Filter out revisions until we reach the one we've already seen for rev in all_revisions: if not found: if rev == self._latest_head: found = True else: new_revisions.append(rev) if not found and all_revisions: # The previously saved head has been uncommitted, reload # everything msg = "Previous head (%s) not found, loading all revisions" self.log.debug(msg, self._latest_head) return all_revisions return new_revisions return all_revisions def _iterate_ancestors(self, rev: BzrRevision) -> Iterator[BzrRevisionId]: """Return an iterator of this revision's ancestors""" assert self.repo is not None return self.repo.get_graph().iter_ancestry([rev.revision_id]) # We want to cache at most the current revision and the last, no need to # take cache more than this. @lru_cache(maxsize=2) def _get_revision_tree(self, rev: BzrRevisionId) -> Tree: assert self.repo is not None return self.repo.revision_tree(rev) def _store_tree(self, bzr_rev: BzrRevision) -> Sha1Git: """Save the current in-memory tree to storage.""" directories: List[from_disk.Directory] = [self._last_root] while directories: directory = directories.pop() self.storage.directory_add([directory.to_model()]) directories.extend( [ item for item in directory.values() if isinstance(item, from_disk.Directory) ] ) self._prev_revision = bzr_rev return self._last_root.hash def _store_directories_slow( self, bzr_rev: BzrRevision, inventory: Inventory ) -> None: """Store a revision's directories. This is the slow variant: it does not use a diff from the last revision but lists all the files. It is used for the first revision of a load (the null revision for a full run, the last recorded head for an incremental one) or for cases where the headaches of figuring out the delta from the breezy primitives is not worth it. """ # Don't reuse the last root, we're listing everything anyway, and we # could be keeping around deleted files self._last_root = BzrDirectory() for path, entry in inventory.iter_entries(): if path == "": # root repo is created by default continue content = self.store_content(bzr_rev, path, entry) self._last_root[path.encode()] = content def _get_revision_parents(self, bzr_rev: BzrRevision) -> Tuple[Sha1Git, ...]: parents = [] for parent_id in bzr_rev.parent_ids: if parent_id == NULL_REVISION: # Paranoid, don't think that actually happens continue try: revision_id = self._get_revision_id_from_bzr_id(parent_id) except LookupError: if parent_id in self._ghosts: # We can't store ghosts in any meaningful way (yet?). They # have no contents by definition, and they're pretty rare, # so just ignore them. continue raise parents.append(revision_id) return tuple(parents) def _get_revision_id_from_bzr_id(self, bzr_id: BzrRevisionId) -> Sha1Git: """Return the git sha1 of a revision given its bazaar revision id.""" from_cache = self._revision_id_to_sha1git.get(bzr_id) if from_cache is not None: return from_cache # The parent was not loaded in this run, get it from storage from_storage = self.storage.extid_get_from_extid( EXTID_TYPE, ids=[bzr_id], version=EXTID_VERSION ) if len(from_storage) != 1: msg = "Expected 1 match from storage for bzr node %r, got %d" raise LookupError(msg % (bzr_id.hex(), len(from_storage))) return from_storage[0].target.object_id @property def branch(self) -> BzrBranch: """Returns the only branch in the current repository. Bazaar branches can be assimilated to repositories in other VCS like Git or Mercurial. By contrast, a Bazaar repository is just a store of revisions to optimize disk usage, with no particular semantics.""" assert self.repo is not None branches = list(self.repo.find_branches(using=True)) msg = "Expected only 1 branch in the repository, got %d" assert len(branches) == 1, msg % len(branches) self._branch = branches[0] return branches[0] @property def head_revision_id(self) -> BzrRevisionId: """Returns the Bazaar revision id of the branch's head. Bazaar/Breezy branches do not have multiple heads.""" assert self.repo is not None if self._head_revision_id is None: self._head_revision_id = self.branch.last_revision() assert self._head_revision_id is not None return BzrRevisionId(self._head_revision_id) @property def tags(self) -> Optional[Dict[bytes, BzrRevisionId]]: assert self.repo is not None - if self._tags is None and self.branch.supports_tags(): + if self._tags is None: self._tags = { n.encode(): r for n, r in self.branch.tags.get_tag_dict().items() } return self._tags diff --git a/swh/loader/bzr/tests/data/does-not-support-tags.sh b/swh/loader/bzr/tests/data/does-not-support-tags.sh new file mode 100644 index 0000000..b67c3b0 --- /dev/null +++ b/swh/loader/bzr/tests/data/does-not-support-tags.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# Creates a repository with the correct recent format but an old branch +# format that does not support tags + +set -euo pipefail +bzr init-shared-repo does-not-support-tags-repo +cd does-not-support-tags-repo +bzr init --knit does-not-support-tags-branch +cd .. \ No newline at end of file diff --git a/swh/loader/bzr/tests/data/does-not-support-tags.tgz b/swh/loader/bzr/tests/data/does-not-support-tags.tgz new file mode 100644 index 0000000..39b4de7 Binary files /dev/null and b/swh/loader/bzr/tests/data/does-not-support-tags.tgz differ diff --git a/swh/loader/bzr/tests/test_loader.py b/swh/loader/bzr/tests/test_loader.py index 60dc06f..399c9ec 100644 --- a/swh/loader/bzr/tests/test_loader.py +++ b/swh/loader/bzr/tests/test_loader.py @@ -1,411 +1,430 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from pathlib import Path from breezy.builtins import cmd_uncommit import pytest from swh.loader.bzr.loader import BazaarLoader, BzrDirectory from swh.loader.tests import ( assert_last_visit_matches, get_stats, prepare_repository_from_archive, ) from swh.model.from_disk import Content from swh.model.hashutil import hash_to_bytes from swh.storage.algos.snapshot import snapshot_get_latest # Generated repositories: # - needs-upgrade: # - Repository needs upgrade # - empty: # - Empty repo # - renames: # - File rename # - Directory renames # - Directory renames *and* file rename conflicting # - no-branch: # - No branch # - metadata-and-type-changes: # - Directory removed # - Kind changed (file to symlink, directory to file, etc.) # - not changed_content and not renamed and not kind_changed (so, exec file?) # - Executable file # - Empty commit (bzr commit --unchanged) # - ghosts # - Ghost revisions # - broken-tags # - Tags corruption +# - does-not-support-tags +# - Repo is recent but branch does not support tags, needs upgraded # TODO tests: # - Root path listed in changes (does that even happen?) # - Parent is :null (does that even happen?) # - Case insensitive removal (Is it actually a problem?) # - Truly corrupted revision? # - No match from storage (wrong topo sort or broken rev) def do_uncommit(repo_url): """Remove the latest revision from the given bzr repo""" uncommit_cmd = cmd_uncommit() with open(os.devnull, "w") as f: uncommit_cmd.outf = f uncommit_cmd.run(repo_url) @pytest.mark.parametrize("do_clone", [False, True]) def test_nominal(swh_storage, datadir, tmp_path, do_clone): archive_path = Path(datadir, "nominal.tgz") repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path) if do_clone: # Check that the cloning mechanism works loader = BazaarLoader(swh_storage, repo_url) else: loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) res = loader.load() assert res == {"status": "eventful"} assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr") snapshot = snapshot_get_latest(swh_storage, repo_url) expected_branches = [ b"HEAD", b"tags/0.1", b"tags/latest", b"tags/other-tag", b"trunk", ] assert sorted(snapshot.branches.keys()) == expected_branches stats = get_stats(swh_storage) assert stats == { "content": 7, "directory": 7, "origin": 1, "origin_visit": 1, "release": 3, "revision": 6, "skipped_content": 0, "snapshot": 1, } # It contains associated bugs, making it a good complete candidate example_revision = hash_to_bytes("18bb5b2c866c10c58a191afcd0b450a8727f1c62") revision = loader.storage.revision_get([example_revision])[0] assert revision.to_dict() == { "message": b"fixing bugs", "author": { "fullname": b"Rapha\xc3\xabl Gom\xc3\xa8s ", "name": b"Rapha\xc3\xabl Gom\xc3\xa8s", "email": b"alphare@alphare-carbon.lan", }, "committer": { "fullname": b"Rapha\xc3\xabl Gom\xc3\xa8s ", "name": b"Rapha\xc3\xabl Gom\xc3\xa8s", "email": b"alphare@alphare-carbon.lan", }, "date": { "timestamp": {"seconds": 1643302390, "microseconds": 0}, "offset": 60, "negative_utc": False, "offset_bytes": b"+0100", }, "committer_date": { "timestamp": {"seconds": 1643302390, "microseconds": 0}, "offset": 60, "negative_utc": False, "offset_bytes": b"+0100", }, "type": "bzr", "directory": b"s0\xf3pe\xa3\x12\x05{\xc7\xbc\x86\xa6\x14.\xc1b\x1c\xeb\x05", "synthetic": False, "metadata": None, "parents": (b"*V\xf5\n\xf0?\x1d{kE4\xda(\xb1\x08R\x83\x87-\xb6",), "id": example_revision, "extra_headers": ( (b"time_offset_seconds", b"3600"), (b"bug", b"fixed https://launchpad.net/bugs/1234"), (b"bug", b"fixed https://bz.example.com/?show_bug=4321"), ), } def test_needs_upgrade(swh_storage, datadir, tmp_path, mocker): """Old bzr repository format should be upgraded to latest format""" archive_path = Path(datadir, "needs-upgrade.tgz") repo_url = prepare_repository_from_archive(archive_path, "needs-upgrade", tmp_path) - res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + upgrade_spy = mocker.spy(loader, "run_upgrade") + res = loader.load() + upgrade_spy.assert_called() assert res == {"status": "uneventful"} # needs-upgrade is an empty repo +def test_does_not_support_tags(swh_storage, datadir, tmp_path, mocker): + """Repository format is correct, but the branch itself does not support tags + and should be upgraded to the latest format""" + archive_path = Path(datadir, "does-not-support-tags.tgz") + path = "does-not-support-tags-repo/does-not-support-tags-branch" + repo_url = prepare_repository_from_archive(archive_path, path, tmp_path,) + + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + upgrade_spy = mocker.spy(loader, "run_upgrade") + res = loader.load() + upgrade_spy.assert_called() + assert res == {"status": "uneventful"} # does-not-support-tags is an empty repo + + def test_no_branch(swh_storage, datadir, tmp_path): """This should only happen with a broken clone, so the expected result is failure""" archive_path = Path(datadir, "no-branch.tgz") repo_url = prepare_repository_from_archive(archive_path, "no-branch", tmp_path) res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() assert res == {"status": "failed"} def test_empty(swh_storage, datadir, tmp_path): """An empty repository is fine, it's just got no information""" archive_path = Path(datadir, "empty.tgz") repo_url = prepare_repository_from_archive(archive_path, "empty", tmp_path) res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() assert res == {"status": "uneventful"} # Empty snapshot does not bother the incremental code res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() assert res == {"status": "uneventful"} def test_renames(swh_storage, datadir, tmp_path): archive_path = Path(datadir, "renames.tgz") repo_url = prepare_repository_from_archive(archive_path, "renames", tmp_path) res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() assert res == {"status": "eventful"} assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr") snapshot = snapshot_get_latest(swh_storage, repo_url) assert sorted(snapshot.branches.keys()) == [ b"HEAD", b"trunk", ] stats = get_stats(swh_storage) assert stats == { "content": 1, "directory": 5, "origin": 1, "origin_visit": 1, "release": 0, "revision": 2, "skipped_content": 0, "snapshot": 1, } def test_broken_tags(swh_storage, datadir, tmp_path): """A tag pointing to a the null revision should not break anything""" archive_path = Path(datadir, "broken-tags.tgz") repo_url = prepare_repository_from_archive(archive_path, "broken-tags", tmp_path) res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() assert res == {"status": "uneventful"} assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr") snapshot = snapshot_get_latest(swh_storage, repo_url) assert sorted(snapshot.branches.keys()) == [ b"tags/null-tag", # broken tag does appear, but didn't cause any issues ] stats = get_stats(swh_storage) assert stats == { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, # Does not count as a valid release "revision": 0, "skipped_content": 0, "snapshot": 1, } def test_metadata_and_type_changes(swh_storage, datadir, tmp_path): archive_path = Path(datadir, "metadata-and-type-changes.tgz") repo_url = prepare_repository_from_archive( archive_path, "metadata-and-type-changes", tmp_path ) res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() assert res == {"status": "eventful"} assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr") snapshot = snapshot_get_latest(swh_storage, repo_url) assert sorted(snapshot.branches.keys()) == [ b"HEAD", b"trunk", ] stats = get_stats(swh_storage) assert stats == { "content": 1, "directory": 9, "origin": 1, "origin_visit": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 1, } def test_ghosts(swh_storage, datadir, tmp_path): archive_path = Path(datadir, "ghosts.tgz") repo_url = prepare_repository_from_archive(archive_path, "ghosts", tmp_path) loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) assert loader._ghosts == set() res = loader.load() assert loader._ghosts == set((b"iamaghostboo",)) assert res == {"status": "eventful"} assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr") snapshot = snapshot_get_latest(swh_storage, repo_url) assert sorted(snapshot.branches.keys()) == [ b"HEAD", b"tags/brokentag", # tag pointing to a ghost revision is tracked b"trunk", ] stats = get_stats(swh_storage) assert stats == { "content": 0, # No contents "directory": 1, # Root directory always counts "origin": 1, "origin_visit": 1, "release": 0, # Ghost tag is ignored, stored as dangling "revision": 1, # Only one revision, the ghost is ignored "skipped_content": 0, "snapshot": 1, } def test_bzr_directory(): directory = BzrDirectory() directory[b"a/decently/enough/nested/path"] = Content(b"whatever") directory[b"a/decently/other_node"] = Content(b"whatever else") directory[b"another_node"] = Content(b"contents") assert directory[b"a/decently/enough/nested/path"] == Content(b"whatever") assert directory[b"a/decently/other_node"] == Content(b"whatever else") assert directory[b"another_node"] == Content(b"contents") del directory[b"a/decently/enough/nested/path"] assert directory.get(b"a/decently/enough/nested/path") is None assert directory.get(b"a/decently/enough/nested/") is None assert directory.get(b"a/decently/enough") is None # no KeyError directory[b"a/decently"] directory[b"a"] directory[b"another_node"] def test_incremental_noop(swh_storage, datadir, tmp_path): """Check that nothing happens if we try to load a repo twice in a row""" archive_path = Path(datadir, "nominal.tgz") repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path) loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) res = loader.load() assert res == {"status": "eventful"} loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) res = loader.load() assert res == {"status": "uneventful"} def test_incremental_nominal(swh_storage, datadir, tmp_path): """Check that an updated repository does update after the second run, but is still a noop in the third run.""" archive_path = Path(datadir, "nominal.tgz") repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path) # remove 2 latest commits do_uncommit(repo_url) do_uncommit(repo_url) loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) res = loader.load() assert res == {"status": "eventful"} stats = get_stats(swh_storage) assert stats == { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 2, "revision": 4, "skipped_content": 0, "snapshot": 1, } # Load the complete repo now repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path) loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) res = loader.load() assert res == {"status": "eventful"} stats = get_stats(swh_storage) expected_stats = { "content": 7, "directory": 7, "origin": 1, "origin_visit": 2, "release": 3, "revision": 6, "skipped_content": 0, "snapshot": 2, } assert stats == expected_stats # Nothing should change loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) res = loader.load() assert res == {"status": "uneventful"} stats = get_stats(swh_storage) assert stats == {**expected_stats, "origin_visit": 2 + 1} def test_incremental_uncommitted_head(swh_storage, datadir, tmp_path): """Check that doing an incremental run with the saved head missing does not error out but instead loads everything correctly""" archive_path = Path(datadir, "nominal.tgz") repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path) loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) res = loader.load() assert res == {"status": "eventful"} stats = get_stats(swh_storage) expected_stats = { "content": 7, "directory": 7, "origin": 1, "origin_visit": 1, "release": 3, "revision": 6, "skipped_content": 0, "snapshot": 1, } assert stats == expected_stats # Remove the previously saved head do_uncommit(repo_url) loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) res = loader.load() assert res == {"status": "eventful"} # Everything is loaded correctly stats = get_stats(swh_storage) assert stats == {**expected_stats, "origin_visit": 1 + 1, "snapshot": 1 + 1}