Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/bzr/loader.py
# Copyright (C) 2021-2022 The Software Heritage developers | # Copyright (C) 2021-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""This document contains a SWH loader for ingesting repository data | """This document contains a SWH loader for ingesting repository data | ||||
from Bazaar or Breezy. | from Bazaar or Breezy. | ||||
""" | """ | ||||
from datetime import datetime | from datetime import datetime | ||||
from functools import lru_cache, partial | from functools import lru_cache, partial | ||||
import itertools | import itertools | ||||
import os | import os | ||||
from tempfile import mkdtemp | from tempfile import mkdtemp | ||||
from typing import Dict, Iterator, List, NewType, Optional, Set, TypeVar, Union | from typing import Dict, Iterator, List, NewType, Optional, Set, Tuple, TypeVar, Union | ||||
from breezy import errors as bzr_errors | from breezy import errors as bzr_errors | ||||
from breezy import repository, tsort | from breezy import repository, tsort | ||||
from breezy.builtins import cmd_branch | from breezy.builtins import cmd_branch | ||||
from breezy.bzr import bzrdir | from breezy.bzr import bzrdir | ||||
from breezy.bzr.branch import Branch as BzrBranch | from breezy.bzr.branch import Branch as BzrBranch | ||||
from breezy.bzr.inventory import Inventory, InventoryEntry | from breezy.bzr.inventory import Inventory, InventoryEntry | ||||
from breezy.bzr.inventorytree import InventoryTreeChange | from breezy.bzr.inventorytree import InventoryTreeChange | ||||
▲ Show 20 Lines • Show All 292 Lines • ▼ Show 20 Lines | def fetch_data(self) -> bool: | ||||
raise UnknownRepositoryFormat() | raise UnknownRepositoryFormat() | ||||
self.repo = repo | self.repo = repo | ||||
self.repo.lock_read() | self.repo.lock_read() | ||||
self.head_revision_id # set the property | self.head_revision_id # set the property | ||||
self.tags # set the property | self.tags # set the property | ||||
return False | return False | ||||
def store_data(self) -> None:
    """Store fetched data in the database.

    Stores every revision returned by ``_get_bzr_revs_to_load()``, then
    builds and adds a snapshot made of one ``tags/<name>`` branch per tag
    and, when the branch head is not the null revision, a ``trunk`` branch
    plus a ``HEAD`` alias pointing to it. The id of that snapshot is
    recorded in ``self.loaded_snapshot_id``.

    Tags that point to the null revision, or to a revision for which the
    branch cannot compute a dotted revno, are recorded with a ``None``
    target.

    A branch whose format does not support tags (``self.tags`` is ``None``)
    is handled like a branch that has no tags.
    """
    # Internal invariant: `fetch_data` sets `self.repo` before this runs.
    assert self.repo is not None

    # `self.tags` is None when the branch does not support tags; this must
    # not abort the load, so fall back to "no tags".
    tags = self.tags or {}

    # Insert revisions using a topological sorting
    revs = self._get_bzr_revs_to_load()
    if revs and revs[0] == NULL_REVISION:
        # The first rev we load isn't necessarily `NULL_REVISION` even in a
        # full load, as bzr allows for ghost revisions.
        revs = revs[1:]

    for rev in revs:
        self.store_revision(self.repo.get_revision(rev))

    if not revs:
        # no new revision ingested, so uneventful
        # still we'll make a snapshot, so we continue
        self._load_status = "uneventful"

    snapshot_branches: Dict[bytes, Optional[SnapshotBranch]] = {}

    for tag_name, target in tags.items():
        label = b"tags/%s" % tag_name
        if target == NULL_REVISION:
            # Some very rare repositories have meaningless tags that point
            # to the null revision.
            self.log.debug("Tag '%s' points to the null revision", tag_name)
            snapshot_branches[label] = None
            continue
        try:
            # Used only to detect corruption
            self.branch.revision_id_to_dotted_revno(target)
        except (
            bzr_errors.NoSuchRevision,
            bzr_errors.GhostRevisionsHaveNoRevno,
            bzr_errors.UnsupportedOperation,
        ):
            # Bad tag data/merges can lead to tagged revisions
            # which are not in this branch. We cannot point a tag there.
            snapshot_branches[label] = None
            continue
        snp_target = self._get_revision_id_from_bzr_id(target)
        snapshot_branches[label] = SnapshotBranch(
            target=self.store_release(tag_name, snp_target),
            target_type=TargetType.RELEASE,
        )

    if self.head_revision_id != NULL_REVISION:
        head_revision_git_hash = self._get_revision_id_from_bzr_id(
            self.head_revision_id
        )
        snapshot_branches[b"trunk"] = SnapshotBranch(
            target=head_revision_git_hash, target_type=TargetType.REVISION
        )
        # `HEAD` is only meaningful when `trunk` exists, hence the nesting.
        snapshot_branches[b"HEAD"] = SnapshotBranch(
            target=b"trunk",
            target_type=TargetType.ALIAS,
        )

    snapshot = Snapshot(branches=snapshot_branches)
    self.storage.snapshot_add([snapshot])
    self.flush()
    self.loaded_snapshot_id = snapshot.id
def store_revision(self, bzr_rev: BzrRevision): | def store_revision(self, bzr_rev: BzrRevision) -> None: | ||||
self.log.debug("Storing revision '%s'", bzr_rev.revision_id) | self.log.debug("Storing revision '%s'", bzr_rev.revision_id) | ||||
directory = self.store_directories(bzr_rev) | directory = self.store_directories(bzr_rev) | ||||
associated_bugs = [ | associated_bugs = [ | ||||
(b"bug", b"%s %s" % (status.encode(), url.encode())) | (b"bug", b"%s %s" % (status.encode(), url.encode())) | ||||
for url, status in bzr_rev.iter_bugs() | for url, status in bzr_rev.iter_bugs() | ||||
] | ] | ||||
extra_headers = [ | extra_headers = [ | ||||
(b"time_offset_seconds", str(bzr_rev.timezone).encode(),), | (b"time_offset_seconds", str(bzr_rev.timezone).encode(),), | ||||
▲ Show 20 Lines • Show All 190 Lines • ▼ Show 20 Lines | def _store_tree(self, bzr_rev: BzrRevision) -> Sha1Git: | ||||
item | item | ||||
for item in directory.values() | for item in directory.values() | ||||
if isinstance(item, from_disk.Directory) | if isinstance(item, from_disk.Directory) | ||||
] | ] | ||||
) | ) | ||||
self._prev_revision = bzr_rev | self._prev_revision = bzr_rev | ||||
return self._last_root.hash | return self._last_root.hash | ||||
def _store_directories_slow(
    self, bzr_rev: BzrRevision, inventory: Inventory
) -> None:
    """Rebuild the whole directory tree of a revision from its inventory.

    Slow path: every inventory entry is listed and stored, instead of
    applying a diff against the previously loaded revision. It is used for
    the first revision of a load (the null revision for a full run, the
    last recorded head for an incremental one), or whenever working out the
    delta from the breezy primitives is not worth the trouble.
    """
    # Start from a brand new root: everything is listed again anyway, and
    # reusing the previous root could keep deleted files around.
    self._last_root = BzrDirectory()

    # The empty path is the repository root, which already exists.
    non_root_entries = (
        (path, entry) for path, entry in inventory.iter_entries() if path != ""
    )
    for path, entry in non_root_entries:
        stored = self.store_content(bzr_rev, path, entry)
        self._last_root[path.encode()] = stored
def _get_revision_parents(self, bzr_rev: BzrRevision): | def _get_revision_parents(self, bzr_rev: BzrRevision) -> Tuple[Sha1Git, ...]: | ||||
parents = [] | parents = [] | ||||
for parent_id in bzr_rev.parent_ids: | for parent_id in bzr_rev.parent_ids: | ||||
if parent_id == NULL_REVISION: | if parent_id == NULL_REVISION: | ||||
# Paranoid, don't think that actually happens | # Paranoid, don't think that actually happens | ||||
continue | continue | ||||
try: | try: | ||||
revision_id = self._get_revision_id_from_bzr_id(parent_id) | revision_id = self._get_revision_id_from_bzr_id(parent_id) | ||||
except LookupError: | except LookupError: | ||||
Show All 32 Lines | def branch(self) -> BzrBranch: | ||||
assert self.repo is not None | assert self.repo is not None | ||||
branches = list(self.repo.find_branches(using=True)) | branches = list(self.repo.find_branches(using=True)) | ||||
msg = "Expected only 1 branch in the repository, got %d" | msg = "Expected only 1 branch in the repository, got %d" | ||||
assert len(branches) == 1, msg % len(branches) | assert len(branches) == 1, msg % len(branches) | ||||
self._branch = branches[0] | self._branch = branches[0] | ||||
return branches[0] | return branches[0] | ||||
@property
def head_revision_id(self) -> BzrRevisionId:
    """Return the Bazaar revision id of the head of the branch.

    The value is read from ``self.branch.last_revision()`` on first access
    and cached in ``self._head_revision_id`` afterwards.

    Bazaar/Breezy branches do not have multiple heads.
    """
    assert self.repo is not None
    head = self._head_revision_id
    if head is None:
        # First access: ask the branch and remember the answer.
        head = self._head_revision_id = self.branch.last_revision()
    assert head is not None
    return BzrRevisionId(head)
@property
def tags(self) -> Optional[Dict[bytes, BzrRevisionId]]:
    """Return the tags of the branch, keyed by encoded tag name.

    The mapping is built from ``self.branch.tags.get_tag_dict()`` the first
    time it is needed and cached in ``self._tags``. If the branch does not
    support tags, nothing is cached and the current value of ``self._tags``
    (``None`` unless set elsewhere) is returned.
    """
    assert self.repo is not None
    needs_loading = self._tags is None and self.branch.supports_tags()
    if needs_loading:
        tag_dict = self.branch.tags.get_tag_dict()
        # Tag names are encoded with `.encode()` to become bytes keys.
        self._tags = {name.encode(): rev_id for name, rev_id in tag_dict.items()}
    return self._tags