Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/bzr/loader.py
Show All 17 Lines | |||||||||||||
from breezy.bzr import bzrdir | from breezy.bzr import bzrdir | ||||||||||||
from breezy.bzr.branch import Branch as BzrBranch | from breezy.bzr.branch import Branch as BzrBranch | ||||||||||||
from breezy.bzr.inventory import Inventory, InventoryEntry | from breezy.bzr.inventory import Inventory, InventoryEntry | ||||||||||||
from breezy.revision import NULL_REVISION | from breezy.revision import NULL_REVISION | ||||||||||||
from breezy.revision import Revision as BzrRevision | from breezy.revision import Revision as BzrRevision | ||||||||||||
from swh.loader.core.loader import BaseLoader | from swh.loader.core.loader import BaseLoader | ||||||||||||
from swh.loader.core.utils import clean_dangling_folders, clone_with_timeout | from swh.loader.core.utils import clean_dangling_folders, clone_with_timeout | ||||||||||||
from swh.model import from_disk | from swh.model import from_disk, swhids | ||||||||||||
from swh.model.model import ( | from swh.model.model import ( | ||||||||||||
Content, | Content, | ||||||||||||
ExtID, | ExtID, | ||||||||||||
ObjectType, | ObjectType, | ||||||||||||
Origin, | Origin, | ||||||||||||
Person, | Person, | ||||||||||||
Release, | Release, | ||||||||||||
Revision, | Revision, | ||||||||||||
RevisionType, | RevisionType, | ||||||||||||
Sha1Git, | Sha1Git, | ||||||||||||
Snapshot, | Snapshot, | ||||||||||||
SnapshotBranch, | SnapshotBranch, | ||||||||||||
TargetType, | TargetType, | ||||||||||||
Timestamp, | Timestamp, | ||||||||||||
TimestampWithTimezone, | TimestampWithTimezone, | ||||||||||||
) | ) | ||||||||||||
from swh.storage.algos.snapshot import snapshot_get_latest | |||||||||||||
from swh.storage.interface import StorageInterface | from swh.storage.interface import StorageInterface | ||||||||||||
# Name prefix for this loader's temporary directories.
# NOTE(review): presumably matched by `clean_dangling_folders` -- confirm
# against the part of the file not shown here.
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.bzr.from_disk"

# ExtID type and version passed to the storage ExtID endpoints when mapping
# Bazaar revision ids to archived revisions.
EXTID_TYPE = "bzr-nodeid"
EXTID_VERSION: int = 1

# Distinct static type for raw Bazaar revision identifiers (plain bytes at
# runtime).
BzrRevisionId = NewType("BzrRevisionId", bytes)
▲ Show 20 Lines • Show All 103 Lines • ▼ Show 20 Lines | ): | ||||||||||||
self._revision_id_to_sha1git: Dict[BzrRevisionId, Sha1Git] = {} | self._revision_id_to_sha1git: Dict[BzrRevisionId, Sha1Git] = {} | ||||||||||||
self._last_root = BzrDirectory() | self._last_root = BzrDirectory() | ||||||||||||
self._tags: Optional[Dict[bytes, BzrRevisionId]] = None | self._tags: Optional[Dict[bytes, BzrRevisionId]] = None | ||||||||||||
self._head_revision_id: Optional[bytes] = None | self._head_revision_id: Optional[bytes] = None | ||||||||||||
self._branch: Optional[BzrBranch] = None | self._branch: Optional[BzrBranch] = None | ||||||||||||
# Revisions that are pointed to, but don't exist in the current branch | # Revisions that are pointed to, but don't exist in the current branch | ||||||||||||
# Rare, but exist usually for cross-VCS references. | # Rare, but exist usually for cross-VCS references. | ||||||||||||
self._ghosts: Set[BzrRevisionId] = set() | self._ghosts: Set[BzrRevisionId] = set() | ||||||||||||
# Exists if in an incremental run, is the latest saved revision from | |||||||||||||
# this origin | |||||||||||||
self._latest_head: Optional[BzrRevisionId] = None | |||||||||||||
self._load_status = "eventful" | self._load_status = "eventful" | ||||||||||||
self.origin_url = url | self.origin_url = url | ||||||||||||
self.visit_date = visit_date | self.visit_date = visit_date | ||||||||||||
self.directory = directory | self.directory = directory | ||||||||||||
self.repo: Optional[repository.Repository] = None | self.repo: Optional[repository.Repository] = None | ||||||||||||
def pre_cleanup(self) -> None: | def pre_cleanup(self) -> None: | ||||||||||||
Show All 14 Lines | def prepare_origin_visit(self) -> None: | ||||||||||||
""" | """ | ||||||||||||
self.origin = Origin(url=self.origin_url) | self.origin = Origin(url=self.origin_url) | ||||||||||||
def prepare(self) -> None:
    """Recover the state recorded by the latest snapshot of this origin.

    Looks up the latest snapshot stored for ``self.origin_url``. When there
    is one, it is handed to ``_set_recorded_state`` so that the run can be
    incremental; otherwise nothing is recorded.
    """
    previous_snapshot = snapshot_get_latest(self.storage, self.origin_url)
    if not previous_snapshot:
        # No snapshot found for this origin: nothing to resume from.
        return
    self._set_recorded_state(previous_snapshot)
def load_status(self) -> Dict[str, str]:
    """Report the detailed loading status.

    The status starts out as ``"eventful"``.

    Returns: a dictionary that is eventually passed back as the task's
      result to the scheduler, allowing tuning of the task recurrence
      mechanism.
    """
    status = self._load_status
    return dict(status=status)
def _set_recorded_state(self, latest_snapshot: Snapshot) -> None:
    """Record the Bazaar head saved by the previous visit, if it is usable.

    Sets ``self._latest_head`` to the Bazaar revision id that the
    ``b"trunk"`` branch of ``latest_snapshot`` points to. The attribute is
    left untouched (``None`` after ``__init__``), which makes the loader
    load every revision, when:

    - the snapshot has no ``b"trunk"`` branch (e.g. an empty snapshot),
    - the ``b"trunk"`` branch is dangling (``None``),
    - no usable ExtID exists for the branch target, for instance because
      it was filtered out as dangling by ``_get_extids_for_targets``.

    Args:
        latest_snapshot: the latest snapshot stored for this origin.
    """
    try:
        head = latest_snapshot.branches[b"trunk"]
    except KeyError:
        head = None
    if head is None:
        # Nothing to resume from: fall back to a full load.
        return

    extids = self._get_extids_for_targets([head.target])
    if not extids:
        # The previous head cannot be mapped back to a Bazaar revision id
        # (unknown or dangling ExtID): fall back to a full load.
        return

    self._latest_head = BzrRevisionId(extids[0].extid)
def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]:
    """Get all Bzr ExtIDs for the targets in the latest snapshot.

    Every ExtID returned by the storage is also recorded in the
    ``self._revision_id_to_sha1git`` cache (Bazaar revision id -> sha1_git
    of the target revision).

    Args:
        targets: sha1_git identifiers of the revisions to look up.

    Returns:
        The ExtIDs whose target revision is present in the storage.
        Dangling ExtIDs (target missing from the storage) are left out so
        that their target gets loaded again.
    """
    extids = []
    for extid in self.storage.extid_get_from_target(
        swhids.ObjectType.REVISION,
        targets,
        extid_type=EXTID_TYPE,
        extid_version=EXTID_VERSION,
    ):
        extids.append(extid)
        self._revision_id_to_sha1git[
            BzrRevisionId(extid.extid)
        ] = extid.target.object_id

    if not extids:
        return extids

    # Filter out dangling extids, we need to load their target again.
    # The result is materialized into a set: it is only typed as an
    # iterable, and repeated `in` tests against a one-shot iterator would
    # consume it and give wrong answers after the first lookup.
    revisions_missing = set(
        self.storage.revision_missing(
            [extid.target.object_id for extid in extids]
        )
    )
    return [
        extid
        for extid in extids
        if extid.target.object_id not in revisions_missing
    ]
def cleanup(self) -> None:
    """Unlock the repository, if one is set on this loader."""
    repo = self.repo
    if repo is None:
        # No repository was ever set: nothing to release.
        return
    repo.unlock()
def fetch_data(self) -> bool: | def fetch_data(self) -> bool: | ||||||||||||
"""Fetch the data from the source the loader is currently loading | """Fetch the data from the source the loader is currently loading | ||||||||||||
Returns: | Returns: | ||||||||||||
▲ Show 20 Lines • Show All 219 Lines • ▼ Show 20 Lines | def _get_bzr_revs_to_load(self) -> List[BzrRevision]: | ||||||||||||
if parents is None: | if parents is None: | ||||||||||||
# Filter out ghosts, they scare the `TopoSorter`. | # Filter out ghosts, they scare the `TopoSorter`. | ||||||||||||
# Store them to later catch exceptions about missing parent revision | # Store them to later catch exceptions about missing parent revision | ||||||||||||
self._ghosts.add(rev) | self._ghosts.add(rev) | ||||||||||||
continue | continue | ||||||||||||
ancestry.append((rev, parents)) | ancestry.append((rev, parents)) | ||||||||||||
sorter = tsort.TopoSorter(ancestry) | sorter = tsort.TopoSorter(ancestry) | ||||||||||||
return sorter.sorted() | all_revisions = sorter.sorted() | ||||||||||||
if self._latest_head is not None: | |||||||||||||
# Breezy does not offer a generic querying system, so we do the | |||||||||||||
# filtering ourselves, which is simple enough given that bzr does | |||||||||||||
# not have multiple heads per branch | |||||||||||||
found = False | |||||||||||||
new_revisions = [] | |||||||||||||
# Filter out revisions until we reach the one we've already seen | |||||||||||||
for rev in all_revisions: | |||||||||||||
if not found: | |||||||||||||
if rev == self._latest_head: | |||||||||||||
ardumont: do we need to continue iterating over the list if we found what we were looking for? | |||||||||||||
[Done] Alphare: We do, since we're only appending to `new_revisions` after we've found the previous head. | |||||||||||||
found = True | |||||||||||||
else: | |||||||||||||
new_revisions.append(rev) | |||||||||||||
if not found and all_revisions: | |||||||||||||
# The previously saved head has been uncommitted, reload | |||||||||||||
# everything | |||||||||||||
msg = "Previous head (%s) not found, loading all revisions" | |||||||||||||
self.log.debug(msg, self._latest_head) | |||||||||||||
return all_revisions | |||||||||||||
return new_revisions | |||||||||||||
return all_revisions | |||||||||||||
def _iterate_ancestors(self, rev: BzrRevision) -> Iterator[BzrRevisionId]:
    """Return an iterator over the ancestry of ``rev``.

    The ancestry is obtained from the repository graph, starting from
    ``rev.revision_id``. The repository must already be set.
    """
    assert self.repo is not None
    graph = self.repo.get_graph()
    return graph.iter_ancestry([rev.revision_id])
@lru_cache() | @lru_cache() | ||||||||||||
def _get_revision_tree(self, rev: BzrRevisionId): | def _get_revision_tree(self, rev: BzrRevisionId): | ||||||||||||
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | def _get_revision_parents(self, bzr_rev: BzrRevision): | ||||||||||||
continue | continue | ||||||||||||
raise | raise | ||||||||||||
parents.append(revision_id) | parents.append(revision_id) | ||||||||||||
return tuple(parents) | return tuple(parents) | ||||||||||||
def _get_revision_id_from_bzr_id(self, bzr_id: BzrRevisionId) -> Sha1Git: | def _get_revision_id_from_bzr_id(self, bzr_id: BzrRevisionId) -> Sha1Git: | ||||||||||||
"""Return the git sha1 of a revision given its bazaar revision id.""" | """Return the git sha1 of a revision given its bazaar revision id.""" | ||||||||||||
return self._revision_id_to_sha1git[bzr_id] | from_cache = self._revision_id_to_sha1git.get(bzr_id) | ||||||||||||
if from_cache is not None: | |||||||||||||
return from_cache | |||||||||||||
# The parent was not loaded in this run, get it from storage | |||||||||||||
from_storage = self.storage.extid_get_from_extid( | |||||||||||||
EXTID_TYPE, ids=[bzr_id], version=EXTID_VERSION | |||||||||||||
[Not Done] vlorentz: You should find a way to batch requests to this endpoint eventually, but it's not a blocker for this diff. | |||||||||||||
[Done] Alphare: I'm not sure what the improvement would be. The endpoint is rarely accessed, only once or twice when doing an incremental run or with broken tags. Maybe I'm missing your point. :) | |||||||||||||
) | |||||||||||||
[Done] vlorentz: Why not this? | |||||||||||||
[Done] Alphare: I actually have no idea why this is the way it is. I possibly needed to filter something and forgot about it later. Thanks for spotting it. | |||||||||||||
if len(from_storage) != 1: | |||||||||||||
msg = "Expected 1 match from storage for bzr node %r, got %d" | |||||||||||||
raise LookupError(msg % (bzr_id.hex(), len(from_storage))) | |||||||||||||
return from_storage[0].target.object_id | |||||||||||||
@property | @property | ||||||||||||
def branch(self) -> BzrBranch: | def branch(self) -> BzrBranch: | ||||||||||||
"""Returns the only branch in the current repository. | """Returns the only branch in the current repository. | ||||||||||||
Bazaar branches can be assimilated to repositories in other VCS like | Bazaar branches can be assimilated to repositories in other VCS like | ||||||||||||
Git or Mercurial. By contrast, a Bazaar repository is just a store of | Git or Mercurial. By contrast, a Bazaar repository is just a store of | ||||||||||||
revisions to optimize disk usage, with no particular semantics.""" | revisions to optimize disk usage, with no particular semantics.""" | ||||||||||||
Show All 25 Lines |
do we need to continue iterating over the list if we found what we were looking for?