Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/bzr/loader.py
| # Copyright (C) 2021-2022 The Software Heritage developers | # Copyright (C) 2021-2022 The Software Heritage developers | ||||
| # See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
| # License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
| # See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
| """This document contains a SWH loader for ingesting repository data | """This document contains a SWH loader for ingesting repository data | ||||
| from Bazaar or Breezy. | from Bazaar or Breezy. | ||||
| """ | """ | ||||
| from datetime import datetime | from datetime import datetime | ||||
| from functools import lru_cache, partial | from functools import lru_cache, partial | ||||
| import itertools | import itertools | ||||
| import os | import os | ||||
| from tempfile import mkdtemp | from tempfile import mkdtemp | ||||
| from typing import Dict, Iterator, List, NewType, Optional, Set, TypeVar, Union | from typing import Dict, Iterator, List, NewType, Optional, Set, Tuple, TypeVar, Union | ||||
| from breezy import errors as bzr_errors | from breezy import errors as bzr_errors | ||||
| from breezy import repository, tsort | from breezy import repository, tsort | ||||
| from breezy.builtins import cmd_branch | from breezy.builtins import cmd_branch | ||||
| from breezy.bzr import bzrdir | from breezy.bzr import bzrdir | ||||
| from breezy.bzr.branch import Branch as BzrBranch | from breezy.bzr.branch import Branch as BzrBranch | ||||
| from breezy.bzr.inventory import Inventory, InventoryEntry | from breezy.bzr.inventory import Inventory, InventoryEntry | ||||
| from breezy.bzr.inventorytree import InventoryTreeChange | from breezy.bzr.inventorytree import InventoryTreeChange | ||||
| ▲ Show 20 Lines • Show All 292 Lines • ▼ Show 20 Lines | def fetch_data(self) -> bool: | ||||
| raise UnknownRepositoryFormat() | raise UnknownRepositoryFormat() | ||||
| self.repo = repo | self.repo = repo | ||||
| self.repo.lock_read() | self.repo.lock_read() | ||||
| self.head_revision_id # set the property | self.head_revision_id # set the property | ||||
| self.tags # set the property | self.tags # set the property | ||||
| return False | return False | ||||
def store_data(self) -> None:
    """Store fetched data in the database.

    Stores every revision returned by ``_get_bzr_revs_to_load()`` (minus a
    leading ``NULL_REVISION``), then builds and stores a snapshot made of:

    - one ``tags/<name>`` branch per tag: a release target, or ``None``
      when the tag points to the null revision or to a revision that
      cannot be resolved in this branch;
    - a ``trunk`` revision branch and a ``HEAD`` alias to it, when the
      branch head is not the null revision.

    Sets ``self._load_status`` to ``"uneventful"`` when there was no
    revision to store, and ``self.loaded_snapshot_id`` to the id of the
    stored snapshot.
    """
    assert self.repo is not None

    # `self.tags` is None when the branch does not support tags. That only
    # means there is nothing to tag; it must not abort the whole load.
    tags = self.tags
    if tags is None:
        tags = {}

    # Insert revisions using a topological sorting
    revs = self._get_bzr_revs_to_load()

    if revs and revs[0] == NULL_REVISION:
        # The first rev we load isn't necessarily `NULL_REVISION` even in a
        # full load, as bzr allows for ghost revisions.
        revs = revs[1:]

    for rev in revs:
        self.store_revision(self.repo.get_revision(rev))

    if not revs:
        # no new revision ingested, so uneventful
        # still we'll make a snapshot, so we continue
        self._load_status = "uneventful"

    snapshot_branches: Dict[bytes, Optional[SnapshotBranch]] = {}

    for tag_name, target in tags.items():
        label = b"tags/%s" % tag_name
        if target == NULL_REVISION:
            # Some very rare repositories have meaningless tags that point
            # to the null revision.
            self.log.debug("Tag '%s' points to the null revision", tag_name)
            snapshot_branches[label] = None
            continue
        try:
            # Used only to detect corruption
            self.branch.revision_id_to_dotted_revno(target)
        except (
            bzr_errors.NoSuchRevision,
            bzr_errors.GhostRevisionsHaveNoRevno,
            bzr_errors.UnsupportedOperation,
        ):
            # Bad tag data/merges can lead to tagged revisions
            # which are not in this branch. We cannot point a tag there.
            snapshot_branches[label] = None
            continue
        snp_target = self._get_revision_id_from_bzr_id(target)
        snapshot_branches[label] = SnapshotBranch(
            target=self.store_release(tag_name, snp_target),
            target_type=TargetType.RELEASE,
        )

    if self.head_revision_id != NULL_REVISION:
        head_revision_git_hash = self._get_revision_id_from_bzr_id(
            self.head_revision_id
        )
        snapshot_branches[b"trunk"] = SnapshotBranch(
            target=head_revision_git_hash, target_type=TargetType.REVISION
        )
        # The alias is only meaningful when `trunk` itself exists.
        snapshot_branches[b"HEAD"] = SnapshotBranch(
            target=b"trunk", target_type=TargetType.ALIAS,
        )

    snapshot = Snapshot(branches=snapshot_branches)
    self.storage.snapshot_add([snapshot])
    self.flush()
    self.loaded_snapshot_id = snapshot.id
| def store_revision(self, bzr_rev: BzrRevision): | def store_revision(self, bzr_rev: BzrRevision) -> None: | ||||
| self.log.debug("Storing revision '%s'", bzr_rev.revision_id) | self.log.debug("Storing revision '%s'", bzr_rev.revision_id) | ||||
| directory = self.store_directories(bzr_rev) | directory = self.store_directories(bzr_rev) | ||||
| associated_bugs = [ | associated_bugs = [ | ||||
| (b"bug", b"%s %s" % (status.encode(), url.encode())) | (b"bug", b"%s %s" % (status.encode(), url.encode())) | ||||
| for url, status in bzr_rev.iter_bugs() | for url, status in bzr_rev.iter_bugs() | ||||
| ] | ] | ||||
| extra_headers = [ | extra_headers = [ | ||||
| (b"time_offset_seconds", str(bzr_rev.timezone).encode(),), | (b"time_offset_seconds", str(bzr_rev.timezone).encode(),), | ||||
| ▲ Show 20 Lines • Show All 190 Lines • ▼ Show 20 Lines | def _store_tree(self, bzr_rev: BzrRevision) -> Sha1Git: | ||||
| item | item | ||||
| for item in directory.values() | for item in directory.values() | ||||
| if isinstance(item, from_disk.Directory) | if isinstance(item, from_disk.Directory) | ||||
| ] | ] | ||||
| ) | ) | ||||
| self._prev_revision = bzr_rev | self._prev_revision = bzr_rev | ||||
| return self._last_root.hash | return self._last_root.hash | ||||
def _store_directories_slow(
    self, bzr_rev: BzrRevision, inventory: Inventory
) -> None:
    """Rebuild the whole directory tree of a revision from its inventory.

    Slow path: instead of applying a diff on top of the previous revision,
    every entry of ``inventory`` is listed and stored again. It is meant
    for the first revision of a load (the null revision for a full run,
    the last recorded head for an incremental one), and for cases where
    working out the delta from the breezy primitives is not worth the
    trouble.
    """
    # Start from a brand new root: everything is listed anyway, and reusing
    # the previous root could keep deleted files around.
    self._last_root = BzrDirectory()

    for entry_path, inventory_entry in inventory.iter_entries():
        # The empty path is the repository root, which already exists.
        if entry_path != "":
            self._last_root[entry_path.encode()] = self.store_content(
                bzr_rev, entry_path, inventory_entry
            )
| def _get_revision_parents(self, bzr_rev: BzrRevision): | def _get_revision_parents(self, bzr_rev: BzrRevision) -> Tuple[Sha1Git, ...]: | ||||
| parents = [] | parents = [] | ||||
| for parent_id in bzr_rev.parent_ids: | for parent_id in bzr_rev.parent_ids: | ||||
| if parent_id == NULL_REVISION: | if parent_id == NULL_REVISION: | ||||
| # Paranoid, don't think that actually happens | # Paranoid, don't think that actually happens | ||||
| continue | continue | ||||
| try: | try: | ||||
| revision_id = self._get_revision_id_from_bzr_id(parent_id) | revision_id = self._get_revision_id_from_bzr_id(parent_id) | ||||
| except LookupError: | except LookupError: | ||||
| Show All 32 Lines | def branch(self) -> BzrBranch: | ||||
| assert self.repo is not None | assert self.repo is not None | ||||
| branches = list(self.repo.find_branches(using=True)) | branches = list(self.repo.find_branches(using=True)) | ||||
| msg = "Expected only 1 branch in the repository, got %d" | msg = "Expected only 1 branch in the repository, got %d" | ||||
| assert len(branches) == 1, msg % len(branches) | assert len(branches) == 1, msg % len(branches) | ||||
| self._branch = branches[0] | self._branch = branches[0] | ||||
| return branches[0] | return branches[0] | ||||
@property
def head_revision_id(self) -> BzrRevisionId:
    """Bazaar revision id of the head of the branch.

    A Bazaar/Breezy branch has a single head. The value is read from
    ``self.branch.last_revision()`` on first access and kept in
    ``self._head_revision_id`` for later accesses.
    """
    assert self.repo is not None

    head = self._head_revision_id
    if head is None:
        head = self.branch.last_revision()
        self._head_revision_id = head

    assert head is not None
    return BzrRevisionId(head)
@property
def tags(self) -> Optional[Dict[bytes, BzrRevisionId]]:
    """Tags of the branch, as a mapping of encoded tag name to revision id.

    The mapping is built from ``self.branch.tags.get_tag_dict()`` on first
    access and kept in ``self._tags``. ``None`` is returned when nothing
    has been cached and the branch reports that it does not support tags.
    """
    assert self.repo is not None

    if self._tags is not None:
        return self._tags

    if not self.branch.supports_tags():
        return None

    encoded_tags = {}
    for name, revision_id in self.branch.tags.get_tag_dict().items():
        encoded_tags[name.encode()] = revision_id
    self._tags = encoded_tags
    return self._tags