diff --git a/.gitignore b/.gitignore new file mode 100644 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +*.pyc +*.sw? +*~ +/.coverage +/.coverage.* +.eggs/ +__pycache__ +*.egg-info/ +version.txt +build/ +dist/ +.tox/ +.mypy_cache/ +.vscode/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,4 +48,3 @@ # entry: pydocstyle --convention=google # language: python # types: [python] - diff --git a/swh/foo/loader.py b/conftest.py rename from swh/foo/loader.py rename to conftest.py --- a/swh/foo/loader.py +++ b/conftest.py @@ -1,8 +1,10 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -"""This document contains a SWH loader for ingesting repository data -from Bazaar or Breezy. -""" +pytest_plugins = [ + "swh.scheduler.pytest_plugin", + "swh.storage.pytest_plugin", + "swh.loader.pytest_plugin", +] diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -13,3 +13,6 @@ # [mypy-add_your_lib_here.*] # ignore_missing_imports = True + +[mypy-breezy.*] +ignore_missing_imports = True diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,4 @@ -# Add here internal Software Heritage dependencies, one per line. 
-swh.core[http] >= 0.3 # [http] is required by swh.core.pytest_plugin +swh.model >= 2.6.1 +swh.storage >= 0.41.1 +swh.scheduler >= 0.23.0 +swh.loader.core >= 2.3.0 diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1 +1,5 @@ pytest +pytest-mock +swh.core[http] >= 0.0.61 +swh.scheduler[testing] >= 0.5.0 +swh.storage[testing] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html +breezy diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -55,7 +55,12 @@ use_scm_version=True, extras_require={"testing": parse_requirements("test")}, include_package_data=True, - entry_points="""""", + entry_points=""" + [swh.workers] + loader.bzr=swh.loader.bzr:register + [console_scripts] + swh-bzr-identify=swh.loader.bzr.identify:main + """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", @@ -66,7 +71,7 @@ project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", - "Source": "https://forge.softwareheritage.org/source/swh-loader-bzr", - "Documentation": "https://docs.softwareheritage.org/devel/swh-loader-bzr/", + "Source": ("https://forge.softwareheritage.org/source/swh-loader-bzr"), + "Documentation": "https://docs.softwareheritage.org/devel/swh-loader-bzr/", # NoQA: E501 }, ) diff --git a/swh/foo/tests/__init__.py 
b/swh/foo/tests/__init__.py deleted file mode 100644 diff --git a/swh/foo/tests/test_loader.py b/swh/foo/tests/test_loader.py deleted file mode 100644 --- a/swh/foo/tests/test_loader.py +++ /dev/null @@ -1,3 +0,0 @@ -def test_nothing(): - # Placeholder; remove this when we add actual tests - pass diff --git a/swh/loader/__init__.py b/swh/loader/__init__.py new file mode 100644 --- /dev/null +++ b/swh/loader/__init__.py @@ -0,0 +1,3 @@ +from pkgutil import extend_path + +__path__ = extend_path(__path__, __name__) diff --git a/swh/loader/bzr/__init__.py b/swh/loader/bzr/__init__.py new file mode 100644 --- /dev/null +++ b/swh/loader/bzr/__init__.py @@ -0,0 +1,16 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + from swh.loader.bzr.loader import BazaarLoader + + return { + "task_modules": ["%s.tasks" % __name__], + "loader": BazaarLoader, + } diff --git a/swh/loader/bzr/loader.py b/swh/loader/bzr/loader.py new file mode 100644 --- /dev/null +++ b/swh/loader/bzr/loader.py @@ -0,0 +1,539 @@ +# Copyright (C) 2021-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""This document contains a SWH loader for ingesting repository data +from Bazaar or Breezy. 
+""" +from datetime import datetime +from functools import lru_cache, partial +import os +from tempfile import mkdtemp +from typing import Dict, Iterator, List, NewType, Optional, Set, TypeVar, Union + +from breezy import errors as bzr_errors +from breezy import repository, tsort +from breezy.builtins import cmd_clone +from breezy.bzr import bzrdir +from breezy.bzr.branch import Branch as BzrBranch +from breezy.bzr.inventory import Inventory, InventoryEntry +from breezy.revision import NULL_REVISION +from breezy.revision import Revision as BzrRevision + +from swh.loader.core.loader import BaseLoader +from swh.loader.core.utils import clean_dangling_folders, clone_with_timeout +from swh.model import from_disk +from swh.model.model import ( + Content, + ExtID, + ObjectType, + Origin, + Person, + Release, + Revision, + RevisionType, + Sha1Git, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, +) +from swh.storage.interface import StorageInterface + +TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.bzr.from_disk" +EXTID_TYPE = "bzr-nodeid" +EXTID_VERSION: int = 1 + +BzrRevisionId = NewType("BzrRevisionId", bytes) + +T = TypeVar("T") + +# These are all the old Bazaar repository formats that we might encounter +# in the wild. Bazaar's `clone` does not result in an upgrade, it needs to be +# explicit. 
+older_repository_formats = { + b"Bazaar Knit Repository Format 3 (bzr 0.15)\n", + b"Bazaar Knit Repository Format 4 (bzr 1.0)\n", + b"Bazaar RepositoryFormatKnitPack5 (bzr 1.6)\n", + b"Bazaar RepositoryFormatKnitPack5RichRoot (bzr 1.6)\n", + b"Bazaar RepositoryFormatKnitPack5RichRoot (bzr 1.6.1)\n", + b"Bazaar RepositoryFormatKnitPack6 (bzr 1.9)\n", + b"Bazaar RepositoryFormatKnitPack6RichRoot (bzr 1.9)\n", + b"Bazaar development format 2 with subtree support \ + (needs bzr.dev from before 1.8)\n", + b"Bazaar development format 8\n", + b"Bazaar pack repository format 1 (needs bzr 0.92)\n", + b"Bazaar pack repository format 1 with rich root (needs bzr 1.0)\n", + b"Bazaar pack repository format 1 with subtree support (needs bzr 0.92)\n", + b"Bazaar-NG Knit Repository Format 1", +} + +# Latest one as of this time, unlikely to change +expected_repository_format = b"Bazaar repository format 2a (needs bzr 1.16 or later)\n" + + +class RepositoryNeedsUpgrade(Exception): + """The repository we're trying to load is using an old format. + We only support format 2a (the most recent), see `brz help upgrade`""" + + +class UnknownRepositoryFormat(Exception): + """The repository we're trying to load is using an unknown format. + It's possible (though unlikely) that a new format has come out, we should + check before dismissing the repository as broken or unsupported.""" + + +class BzrDirectory(from_disk.Directory): + """A more practical directory. 
+ + - creates missing parent directories + - removes empty directories + """ + + def __setitem__( + self, path: bytes, value: Union[from_disk.Content, "BzrDirectory"] + ) -> None: + if b"/" in path: + head, tail = path.split(b"/", 1) + + directory = self.get(head) + if directory is None or isinstance(directory, from_disk.Content): + directory = BzrDirectory() + self[head] = directory + + directory[tail] = value + else: + super().__setitem__(path, value) + + def __delitem__(self, path: bytes) -> None: + super().__delitem__(path) + + while b"/" in path: # remove empty parent directories + path = path.rsplit(b"/", 1)[0] + if len(self[path]) == 0: + super().__delitem__(path) + else: + break + + def get( + self, path: bytes, default: Optional[T] = None + ) -> Optional[Union[from_disk.Content, "BzrDirectory", T]]: + # TODO move to swh.model.from_disk.Directory + try: + return self[path] + except KeyError: + return default + + +class BazaarLoader(BaseLoader): + """Loads a Bazaar repository""" + + visit_type = "bzr" + + def __init__( + self, + storage: StorageInterface, + url: str, + directory: Optional[str] = None, + logging_class: str = "swh.loader.bzr.Loader", + visit_date: Optional[datetime] = None, + temp_directory: str = "/tmp", + clone_timeout_seconds: int = 7200, + max_content_size: Optional[int] = None, + ): + super().__init__( + storage=storage, + logging_class=logging_class, + max_content_size=max_content_size, + ) + + self._temp_directory = temp_directory + self._clone_timeout = clone_timeout_seconds + self._revision_id_to_sha1git: Dict[BzrRevisionId, Sha1Git] = {} + self._last_root = BzrDirectory() + self._tags: Optional[Dict[bytes, BzrRevisionId]] = None + self._head_revision_id: Optional[bytes] = None + self._branch: Optional[BzrBranch] = None + # Revisions that are pointed to, but don't exist in the current branch + # Rare, but exist usually for cross-VCS references. 
+ self._ghosts: Set[BzrRevisionId] = set() + self._load_status = "eventful" + + self.origin_url = url + self.visit_date = visit_date + self.directory = directory + self.repo: Optional[repository.Repository] = None + + def pre_cleanup(self) -> None: + """As a first step, will try and check for dangling data to cleanup. + This should do its best to avoid raising issues. + + """ + clean_dangling_folders( + self._temp_directory, + pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, + log=self.log, + ) + + def prepare_origin_visit(self) -> None: + """First step executed by the loader to prepare origin and visit + references. Set/update self.origin, and + optionally self.origin_url, self.visit_date. + + """ + self.origin = Origin(url=self.origin_url) + + def prepare(self) -> None: + """Second step executed by the loader to prepare some state needed by + the loader. + """ + + def load_status(self) -> Dict[str, str]: + """Detailed loading status. + + Defaults to logging an eventful load. + + Returns: a dictionary that is eventually passed back as the task's + result to the scheduler, allowing tuning of the task recurrence + mechanism. + """ + return { + "status": self._load_status, + } + + def cleanup(self) -> None: + if self.repo is not None: + self.repo.unlock() + + def fetch_data(self) -> bool: + """Fetch the data from the source the loader is currently loading + + Returns: + a value that is interpreted as a boolean. If True, fetch_data needs + to be called again to complete loading. 
+ + """ + if not self.directory: # no local repository + self._repo_directory = mkdtemp( + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + suffix=f"-{os.getpid()}", + dir=self._temp_directory, + ) + msg = "Cloning '%s' to '%s' with timeout %s seconds" + self.log.debug( + msg, self.origin_url, self._repo_directory, self._clone_timeout + ) + closure = partial(cmd_clone().run, self.origin_url, self._repo_directory) + clone_with_timeout( + self.origin_url, self._repo_directory, closure, self._clone_timeout + ) + else: # existing local repository + # Allow to load on disk repository without cloning + # for testing purpose. + self.log.debug("Using local directory '%s'", self.directory) + self._repo_directory = self.directory + + res = bzrdir.BzrDir.open_containing_tree_branch_or_repository( + self._repo_directory + ) + (_tree, _branch, repo, _relpath) = res + repository_format = repo._format.as_string() # lies about being a string + if not repository_format == expected_repository_format: + if repository_format in older_repository_formats: + raise RepositoryNeedsUpgrade() + else: + raise UnknownRepositoryFormat() + + self.repo = repo + self.repo.lock_read() + self.head_revision_id # set the property + self.tags # set the property + return False + + def store_data(self): + """Store fetched data in the database.""" + # Insert revisions using a topological sorting + revs = self._get_bzr_revs_to_load() + + if revs and revs[0] == NULL_REVISION: + # The first rev we load isn't necessarily `NULL_REVISION` even in a + # full load, as bzr allows for ghost revisions. 
+ revs = revs[1:] + + length_ingested_revs = 0 + for rev in revs: + self.store_revision(self.repo.get_revision(rev)) + length_ingested_revs += 1 + + if length_ingested_revs == 0: + # no new revision ingested, so uneventful + # still we'll make a snapshot, so we continue + self._load_status = "uneventful" + + snapshot_branches: Dict[bytes, SnapshotBranch] = {} + + for tag_name, target in self.tags.items(): + label = b"tags/%s" % tag_name + if target == NULL_REVISION: + # Some very rare repositories have meaningless tags that point + # to the null revision. + self.log.debug("Tag '%s' points to the null revision", tag_name) + snapshot_branches[label] = None + continue + try: + # Used only to detect corruption + self.branch.revision_id_to_dotted_revno(target) + except ( + bzr_errors.NoSuchRevision, + bzr_errors.GhostRevisionsHaveNoRevno, + bzr_errors.UnsupportedOperation, + ): + # Bad tag data/merges can lead to tagged revisions + # which are not in this branch. We cannot point a tag there. 
+ snapshot_branches[label] = None + continue + target = self._get_revision_id_from_bzr_id(target) + snapshot_branches[label] = SnapshotBranch( + target=self.store_release(tag_name, target), + target_type=TargetType.RELEASE, + ) + + if self.head_revision_id != NULL_REVISION: + head_revision_git_hash = self._get_revision_id_from_bzr_id( + self.head_revision_id + ) + snapshot_branches[b"trunk"] = SnapshotBranch( + target=head_revision_git_hash, target_type=TargetType.REVISION + ) + snapshot_branches[b"HEAD"] = SnapshotBranch( + target=b"trunk", target_type=TargetType.ALIAS, + ) + + snapshot = Snapshot(branches=snapshot_branches) + self.storage.snapshot_add([snapshot]) + + self.flush() + self.loaded_snapshot_id = snapshot.id + + def store_revision(self, bzr_rev: BzrRevision): + self.log.debug("Storing revision '%s'", bzr_rev.revision_id) + directory = self.store_directories(bzr_rev) + associated_bugs = [ + (b"bug", b"%s %s" % (status.encode(), url.encode())) + for url, status in bzr_rev.iter_bugs() + ] + extra_headers = [ + (b"time_offset_seconds", str(bzr_rev.timezone).encode(),), + *associated_bugs, + ] + timestamp = Timestamp(int(bzr_rev.timestamp), 0) + timezone = round(int(bzr_rev.timezone) / 60) + date = TimestampWithTimezone.from_numeric_offset(timestamp, timezone, False) + + # TODO (how) should we store multiple authors? 
(T3887) + revision = Revision( + author=Person.from_fullname(bzr_rev.get_apparent_authors()[0].encode()), + date=date, + committer=Person.from_fullname(bzr_rev.committer.encode()), + committer_date=date, + type=RevisionType.BAZAAR, + directory=directory, + message=bzr_rev.message.encode(), + extra_headers=extra_headers, + synthetic=False, + parents=self._get_revision_parents(bzr_rev), + ) + + self._revision_id_to_sha1git[bzr_rev.revision_id] = revision.id + self.storage.revision_add([revision]) + + self.storage.extid_add( + [ + ExtID( + extid_type=EXTID_TYPE, + extid_version=EXTID_VERSION, + extid=bzr_rev.revision_id, + target=revision.swhid(), + ) + ] + ) + + def store_directories(self, bzr_rev: BzrRevision) -> Sha1Git: + repo: repository.Repository = self.repo + inventory: Inventory = repo.get_inventory(bzr_rev.revision_id) + self._store_directories_slow(bzr_rev, inventory) + return self._store_tree(inventory) + + def store_release(self, name: bytes, target: Sha1Git) -> Sha1Git: + """Store a release given its name and its target. + + Args: + name: name of the release. + target: sha1_git of the target revision. + + Returns: + the sha1_git of the stored release. 
+ """ + release = Release( + name=name, + target=target, + target_type=ObjectType.REVISION, + message=None, + metadata=None, + synthetic=False, + author=Person(name=None, email=None, fullname=b""), + date=None, + ) + + self.storage.release_add([release]) + + return release.id + + def store_content( + self, bzr_rev: BzrRevision, file_path: str, entry: InventoryEntry + ) -> from_disk.Content: + if entry.executable: + perms = from_disk.DentryPerms.executable_content + elif entry.kind == "directory": + perms = from_disk.DentryPerms.directory + elif entry.kind == "symlink": + perms = from_disk.DentryPerms.symlink + elif entry.kind == "file": + perms = from_disk.DentryPerms.content + else: # pragma: no cover + raise RuntimeError("Hit unreachable condition") + + data = b"" + if entry.has_text(): + rev_tree = self._get_revision_tree(bzr_rev.revision_id) + data = rev_tree.get_file(file_path).read() + assert len(data) == entry.text_size + + content = Content.from_data(data) + + self.storage.content_add([content]) + + return from_disk.Content({"sha1_git": content.sha1_git, "perms": perms}) + + def _get_bzr_revs_to_load(self) -> List[BzrRevision]: + assert self.repo is not None + repo: repository.Repository = self.repo + self.log.debug("Getting fully sorted revision tree") + if self.head_revision_id == NULL_REVISION: + return [] + head_revision = repo.get_revision(self.head_revision_id) + # bazaar's model doesn't allow it to iterate on its graph from + # the bottom lazily, but basically all DAGs (especially bzr ones) + # are small enough to fit in RAM. + ancestors_iter = self._iterate_ancestors(head_revision) + ancestry = [] + for rev, parents in ancestors_iter: + if parents is None: + # Filter out ghosts, they scare the `TopoSorter`. 
+            # Store them to later catch exceptions about missing parent revision
+            self._ghosts.add(rev)
+            continue
+        ancestry.append((rev, parents))
+
+        sorter = tsort.TopoSorter(ancestry)
+        return sorter.sorted()
+
+    def _iterate_ancestors(self, rev: BzrRevision) -> Iterator[BzrRevisionId]:
+        """Return an iterator of this revision's ancestors"""
+        assert self.repo is not None
+        return self.repo.get_graph().iter_ancestry([rev.revision_id])
+
+    @lru_cache()
+    def _get_revision_tree(self, rev: BzrRevisionId):
+        assert self.repo is not None
+        return self.repo.revision_tree(rev)
+
+    def _store_tree(self, inventory: Inventory) -> Sha1Git:
+        """Save the current in-memory tree to storage."""
+        directories: List[from_disk.Directory] = [self._last_root]
+        while directories:
+            directory = directories.pop()
+            self.storage.directory_add([directory.to_model()])
+            directories.extend(
+                [
+                    item
+                    for item in directory.values()
+                    if isinstance(item, from_disk.Directory)
+                ]
+            )
+        self._prev_inventory = inventory
+        return self._last_root.hash
+
+    def _store_directories_slow(self, bzr_rev: BzrRevision, inventory: Inventory):
+        """Store a revision's directories given its bzr revision id.
+
+        This is the slow variant: it does not use a diff from the last revision
+        but lists all the files. A future patch will introduce a faster version.
+ """ + # Don't reuse the last root, we're listing everything anyway, and we + # could be keeping around deleted files + self._last_root = BzrDirectory() + for path, entry in inventory.iter_entries(): + if path == "": + # root repo is created by default + continue + content = self.store_content(bzr_rev, path, entry) + self._last_root[path.encode()] = content + + def _get_revision_parents(self, bzr_rev: BzrRevision): + parents = [] + for parent_id in bzr_rev.parent_ids: + if parent_id == NULL_REVISION: + # Paranoid, don't think that actually happens + continue + try: + revision_id = self._get_revision_id_from_bzr_id(parent_id) + except LookupError: + if parent_id in self._ghosts: + # We can't store ghosts in any meaningful way (yet?). They + # have no contents by definition, and they're pretty rare, + # so just ignore them. + continue + raise + parents.append(revision_id) + + return tuple(parents) + + def _get_revision_id_from_bzr_id(self, bzr_id: BzrRevisionId) -> Sha1Git: + """Return the git sha1 of a revision given its bazaar revision id.""" + return self._revision_id_to_sha1git[bzr_id] + + @property + def branch(self) -> BzrBranch: + """Returns the only branch in the current repository. + + Bazaar branches can be assimilated to repositories in other VCS like + Git or Mercurial. By contrast, a Bazaar repository is just a store of + revisions to optimize disk usage, with no particular semantics.""" + assert self.repo is not None + branches = list(self.repo.find_branches(using=True)) + msg = "Expected only 1 branch in the repository, got %d" + assert len(branches) == 1, msg % len(branches) + self._branch = branches[0] + return branches[0] + + @property + def head_revision_id(self) -> bytes: + """Returns the Bazaar revision id of the branch's head. 
+ + Bazaar/Breezy branches do not have multiple heads.""" + assert self.repo is not None + if self._head_revision_id is None: + self._head_revision_id = self.branch.last_revision() + return self._head_revision_id + + @property + def tags(self) -> Optional[Dict[bytes, BzrRevisionId]]: + assert self.repo is not None + if self._tags is None and self.branch.supports_tags(): + self._tags = { + n.encode(): r for n, r in self.branch.tags.get_tag_dict().items() + } + return self._tags diff --git a/swh/foo/py.typed b/swh/loader/bzr/py.typed rename from swh/foo/py.typed rename to swh/loader/bzr/py.typed diff --git a/swh/foo/__init__.py b/swh/loader/bzr/tests/__init__.py rename from swh/foo/__init__.py rename to swh/loader/bzr/tests/__init__.py diff --git a/swh/loader/bzr/tests/conftest.py b/swh/loader/bzr/tests/conftest.py new file mode 100644 --- /dev/null +++ b/swh/loader/bzr/tests/conftest.py @@ -0,0 +1,39 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Any, Dict + +import pytest + + +@pytest.fixture +def swh_storage_backend_config(swh_storage_backend_config): + """Basic pg storage configuration with no journal collaborator + (to avoid pulling optional dependency on clients of this fixture) + + """ + return { + "cls": "filter", + "storage": { + "cls": "buffer", + "min_batch_size": { + "content": 10, + "content_bytes": 100 * 1024 * 1024, + "directory": 10, + "revision": 10, + "release": 10, + }, + "storage": swh_storage_backend_config, + }, + } + + +@pytest.fixture +def swh_loader_config(swh_storage_backend_config, tmp_path) -> Dict[str, Any]: + return { + "storage": swh_storage_backend_config, + "max_content_size": 104857600, + "temp_directory": str(tmp_path), + } diff --git a/swh/loader/bzr/tests/data/broken-tags.sh 
b/swh/loader/bzr/tests/data/broken-tags.sh new file mode 100644 --- /dev/null +++ b/swh/loader/bzr/tests/data/broken-tags.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# Creates a Bazaar repository with a tag pointing to the null revision +# Requires Breezy 3.2+ +set -euo pipefail + +bzr init broken-tags +cd broken-tags +bzr tag null-tag +cd .. \ No newline at end of file diff --git a/swh/loader/bzr/tests/data/broken-tags.tgz b/swh/loader/bzr/tests/data/broken-tags.tgz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ a.txt +mkdir dir +echo "contents\nhere" > dir/b.txt +mkdir empty-dir +echo c > dir/c +echo d > d +bzr add * +bzr commit -m "Initial commit" +cd .. +bzr branch nominal nominal-branch +cd nominal-branch +echo "other text" >> a.txt +bzr add * +bzr commit -m "Modified a \nThis change happened in another branch" +cd ../nominal +bzr merge ../nominal-branch +bzr commit -m merge +ln -s dir link +bzr add * +bzr commit -m "Add symlink" +rm d +bzr commit -m "deleted d" +bzr tag -r 2 0.1 +bzr tag -r 2 other-tag +bzr tag -r 4 latest +echo fix-bug >> dir/b.txt +bzr config bugtracker_bz_url="https://bz.example.com/?show_bug={id}" +bzr commit -m "fixing bugs" --fixes lp:1234 --fixes bz:4321 \ No newline at end of file diff --git a/swh/loader/bzr/tests/data/nominal.tgz b/swh/loader/bzr/tests/data/nominal.tgz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@", + "name": b"Rapha\xc3\xabl Gom\xc3\xa8s", + "email": b"alphare@alphare-carbon.lan", + }, + "committer": { + "fullname": b"Rapha\xc3\xabl Gom\xc3\xa8s ", + "name": b"Rapha\xc3\xabl Gom\xc3\xa8s", + "email": b"alphare@alphare-carbon.lan", + }, + "date": { + "timestamp": {"seconds": 1643302390, "microseconds": 0}, + "offset": 60, + "negative_utc": False, + "offset_bytes": b"+0100", + }, + "committer_date": { + "timestamp": {"seconds": 
1643302390, "microseconds": 0}, + "offset": 60, + "negative_utc": False, + "offset_bytes": b"+0100", + }, + "type": "bzr", + "directory": b"s0\xf3pe\xa3\x12\x05{\xc7\xbc\x86\xa6\x14.\xc1b\x1c\xeb\x05", + "synthetic": False, + "metadata": None, + "parents": (b"*V\xf5\n\xf0?\x1d{kE4\xda(\xb1\x08R\x83\x87-\xb6",), + "id": example_revision, + "extra_headers": ( + (b"time_offset_seconds", b"3600"), + (b"bug", b"fixed https://launchpad.net/bugs/1234"), + (b"bug", b"fixed https://bz.example.com/?show_bug=4321"), + ), + } + + +def test_needs_upgrade(swh_storage, datadir, tmp_path, mocker): + archive_path = Path(datadir, "needs-upgrade.tgz") + repo_url = prepare_repository_from_archive(archive_path, "needs-upgrade", tmp_path) + + init_spy = mocker.spy(loader_mod.RepositoryNeedsUpgrade, "__init__") + init_spy.assert_not_called() + res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() + init_spy.assert_called() + assert res == {"status": "failed"} + + +def test_no_branch(swh_storage, datadir, tmp_path): + """This should only happen with a broken clone, so the expected result is failure""" + archive_path = Path(datadir, "no-branch.tgz") + repo_url = prepare_repository_from_archive(archive_path, "no-branch", tmp_path) + + res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() + assert res == {"status": "failed"} + + +def test_empty(swh_storage, datadir, tmp_path): + """An empty repository is fine, it's just got no information""" + archive_path = Path(datadir, "empty.tgz") + repo_url = prepare_repository_from_archive(archive_path, "empty", tmp_path) + + res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() + assert res == {"status": "uneventful"} + + +def test_renames(swh_storage, datadir, tmp_path): + archive_path = Path(datadir, "renames.tgz") + repo_url = prepare_repository_from_archive(archive_path, "renames", tmp_path) + + res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() + assert res == {"status": "eventful"} + 
+ assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr") + + snapshot = snapshot_get_latest(swh_storage, repo_url) + + assert sorted(snapshot.branches.keys()) == [ + b"HEAD", + b"trunk", + ] + + stats = get_stats(swh_storage) + assert stats == { + "content": 1, + "directory": 5, + "origin": 1, + "origin_visit": 1, + "release": 0, + "revision": 2, + "skipped_content": 0, + "snapshot": 1, + } + + +def test_broken_tags(swh_storage, datadir, tmp_path): + """A tag pointing to a the null revision should not break anything""" + archive_path = Path(datadir, "broken-tags.tgz") + repo_url = prepare_repository_from_archive(archive_path, "broken-tags", tmp_path) + + res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() + assert res == {"status": "uneventful"} + + assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr") + + snapshot = snapshot_get_latest(swh_storage, repo_url) + + assert sorted(snapshot.branches.keys()) == [ + b"tags/null-tag", # broken tag does appear, but didn't cause any issues + ] + + stats = get_stats(swh_storage) + assert stats == { + "content": 0, + "directory": 0, + "origin": 1, + "origin_visit": 1, + "release": 0, # Does not count as a valid release + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } + + +def test_metadata_and_type_changes(swh_storage, datadir, tmp_path): + archive_path = Path(datadir, "metadata-and-type-changes.tgz") + repo_url = prepare_repository_from_archive( + archive_path, "metadata-and-type-changes", tmp_path + ) + + res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load() + assert res == {"status": "eventful"} + + assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr") + + snapshot = snapshot_get_latest(swh_storage, repo_url) + + assert sorted(snapshot.branches.keys()) == [ + b"HEAD", + b"trunk", + ] + + stats = get_stats(swh_storage) + assert stats == { + "content": 1, + "directory": 9, + "origin": 1, + "origin_visit": 1, + 
"release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 1, + } + + +def test_ghosts(swh_storage, datadir, tmp_path): + archive_path = Path(datadir, "ghosts.tgz") + repo_url = prepare_repository_from_archive(archive_path, "ghosts", tmp_path) + + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + assert loader._ghosts == set() + res = loader.load() + assert loader._ghosts == set((b"iamaghostboo",)) + assert res == {"status": "eventful"} + + assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr") + + snapshot = snapshot_get_latest(swh_storage, repo_url) + + assert sorted(snapshot.branches.keys()) == [ + b"HEAD", + b"tags/brokentag", # tag pointing to a ghost revision is tracked + b"trunk", + ] + + stats = get_stats(swh_storage) + assert stats == { + "content": 0, # No contents + "directory": 1, # Root directory always counts + "origin": 1, + "origin_visit": 1, + "release": 0, # Ghost tag is ignored, stored as dangling + "revision": 1, # Only one revision, the ghost is ignored + "skipped_content": 0, + "snapshot": 1, + } + + +def test_bzr_directory(): + directory = BzrDirectory() + directory[b"a/decently/enough/nested/path"] = Content(b"whatever") + directory[b"a/decently/other_node"] = Content(b"whatever else") + directory[b"another_node"] = Content(b"contents") + + assert directory[b"a/decently/enough/nested/path"] == Content(b"whatever") + assert directory[b"a/decently/other_node"] == Content(b"whatever else") + assert directory[b"another_node"] == Content(b"contents") + + del directory[b"a/decently/enough/nested/path"] + assert directory.get(b"a/decently/enough/nested/path") is None + assert directory.get(b"a/decently/enough/nested/") is None + assert directory.get(b"a/decently/enough") is None + + # no KeyError + directory[b"a/decently"] + directory[b"a"] + directory[b"another_node"] diff --git a/tox.ini b/tox.ini --- a/tox.ini +++ b/tox.ini @@ -8,8 +8,8 @@ pytest-cov commands = pytest --doctest-modules \ - 
{envsitepackagesdir}/swh/foo \ - --cov={envsitepackagesdir}/swh/foo \ + {envsitepackagesdir}/swh/loader/bzr \ + --cov={envsitepackagesdir}/swh/loader/bzr \ --cov-branch {posargs} [testenv:black]