Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/mercurial/from_disk.py
# Copyright (C) 2020-2021 The Software Heritage developers | # Copyright (C) 2020-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import deque | from collections import deque | ||||
from datetime import datetime | from datetime import datetime | ||||
import os | import os | ||||
from shutil import rmtree | from shutil import rmtree | ||||
from tempfile import mkdtemp | from tempfile import mkdtemp | ||||
from typing import Deque, Dict, List, Optional, Tuple, TypeVar, Union | from typing import Deque, Dict, List, Optional, Tuple, TypeVar, Union | ||||
from swh.loader.core.loader import BaseLoader | from swh.loader.core.loader import BaseLoader | ||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||
from swh.loader.mercurial.utils import parse_visit_date | from swh.loader.mercurial.utils import parse_visit_date | ||||
from swh.model import identifiers | |||||
from swh.model.from_disk import Content, DentryPerms, Directory | from swh.model.from_disk import Content, DentryPerms, Directory | ||||
from swh.model.hashutil import hash_to_bytehex, hash_to_bytes | from swh.model.hashutil import hash_to_bytehex, hash_to_bytes | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
ExtID, | |||||
ObjectType, | ObjectType, | ||||
Origin, | Origin, | ||||
Person, | Person, | ||||
Release, | Release, | ||||
Revision, | Revision, | ||||
RevisionType, | RevisionType, | ||||
Sha1Git, | Sha1Git, | ||||
Snapshot, | Snapshot, | ||||
Show All 12 Lines | |||||
FLAG_PERMS = { | FLAG_PERMS = { | ||||
b"l": DentryPerms.symlink, | b"l": DentryPerms.symlink, | ||||
b"x": DentryPerms.executable_content, | b"x": DentryPerms.executable_content, | ||||
b"": DentryPerms.content, | b"": DentryPerms.content, | ||||
} # type: Dict[bytes, DentryPerms] | } # type: Dict[bytes, DentryPerms] | ||||
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" | TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" | ||||
EXTID_TYPE = "hg-nodeid" | |||||
T = TypeVar("T") | T = TypeVar("T") | ||||
class CorruptedRevision(ValueError): | class CorruptedRevision(ValueError): | ||||
"""Raised when a revision is corrupted.""" | """Raised when a revision is corrupted.""" | ||||
def __init__(self, hg_nodeid: HgNodeId) -> None: | def __init__(self, hg_nodeid: HgNodeId) -> None: | ||||
▲ Show 20 Lines • Show All 138 Lines • ▼ Show 20 Lines | def prepare(self) -> None: | ||||
the loader. | the loader. | ||||
""" | """ | ||||
# Set here to allow multiple calls to load on the same loader instance | # Set here to allow multiple calls to load on the same loader instance | ||||
self._latest_heads = [] | self._latest_heads = [] | ||||
latest_snapshot = snapshot_get_latest(self.storage, self.origin_url) | latest_snapshot = snapshot_get_latest(self.storage, self.origin_url) | ||||
if latest_snapshot: | if latest_snapshot: | ||||
self._set_latest_heads(latest_snapshot) | |||||
def _set_latest_heads(self, latest_snapshot: Snapshot) -> None: | |||||
""" | |||||
Looks up the nodeid for all revisions in the snapshot, and adds them to | |||||
self._latest_heads. | |||||
This works in two steps: | |||||
1. Query the revisions with extid_get_from_target, to find nodeids from | |||||
revision ids, using the new ExtID architecture | |||||
2. For all revisions that were not found this way, fetch the revision | |||||
and look for the nodeid in its metadata. | |||||
This is a temporary process. When we are done migrating away from revision | |||||
metadata, step 2 will be removed. | |||||
""" | |||||
# TODO: add support for releases | # TODO: add support for releases | ||||
snapshot_branches = [ | snapshot_branches = [ | ||||
branch.target | branch.target | ||||
for branch in latest_snapshot.branches.values() | for branch in latest_snapshot.branches.values() | ||||
if branch.target_type == TargetType.REVISION | if branch.target_type == TargetType.REVISION | ||||
] | ] | ||||
self._latest_heads = [ | # Get all ExtIDs for revisions in the latest snapshot | ||||
extids = self.storage.extid_get_from_target( | |||||
identifiers.ObjectType.REVISION, snapshot_branches | |||||
) | |||||
# Filter out extids not specific to Mercurial | |||||
extids = [extid for extid in extids if extid.extid_type == EXTID_TYPE] | |||||
if extids: | |||||
# Filter out dangling extids, we need to load their target again | |||||
revisions_missing = self.storage.revision_missing( | |||||
[extid.target.object_id for extid in extids] | |||||
) | |||||
Alphare: While I'm not expecting a great performance difference, why create an intermediate list instead of a generator since it's the only use, right after? | |||||
[Done] vlorentz: It's used twice. (And I prefer to avoid generators because it makes code harder to debug) | |||||
extids = [ | |||||
extid | |||||
for extid in extids | |||||
if extid.target.object_id not in revisions_missing | |||||
] | |||||
# Add the found nodeids to self._latest_heads | |||||
self._latest_heads.extend(extid.extid for extid in extids) | |||||
# For each revision without a nodeid, get the revision metadata | |||||
# to see if it is found there. | |||||
found_revisions = {extid.target.object_id for extid in extids if extid} | |||||
revisions_without_extid = list(set(snapshot_branches) - found_revisions) | |||||
self._latest_heads.extend( | |||||
hash_to_bytes(revision.metadata["node"]) | hash_to_bytes(revision.metadata["node"]) | ||||
for revision in self.storage.revision_get(snapshot_branches) | for revision in self.storage.revision_get(revisions_without_extid) | ||||
if revision and revision.metadata | if revision and revision.metadata | ||||
] | ) | ||||
def fetch_data(self) -> bool: | def fetch_data(self) -> bool: | ||||
"""Fetch the data from the source the loader is currently loading | """Fetch the data from the source the loader is currently loading | ||||
Returns: | Returns: | ||||
a value that is interpreted as a boolean. If True, fetch_data needs | a value that is interpreted as a boolean. If True, fetch_data needs | ||||
to be called again to complete loading. | to be called again to complete loading. | ||||
▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines | def store_data(self): | ||||
} | } | ||||
tags_by_name: Dict[bytes, HgNodeId] = repo.tags() | tags_by_name: Dict[bytes, HgNodeId] = repo.tags() | ||||
tags_by_hg_nodeid: Dict[HgNodeId, bytes] = { | tags_by_hg_nodeid: Dict[HgNodeId, bytes] = { | ||||
hg_nodeid: name for name, hg_nodeid in tags_by_name.items() | hg_nodeid: name for name, hg_nodeid in tags_by_name.items() | ||||
} | } | ||||
snapshot_branches: Dict[bytes, SnapshotBranch] = {} | snapshot_branches: Dict[bytes, SnapshotBranch] = {} | ||||
extids = [] | |||||
for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items(): | for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items(): | ||||
tag_name = tags_by_hg_nodeid.get(hg_nodeid) | tag_name = tags_by_hg_nodeid.get(hg_nodeid) | ||||
# tip is listed in the tags by the mercurial api | # tip is listed in the tags by the mercurial api | ||||
# but it's not a tag defined by the user in `.hgtags` | # but it's not a tag defined by the user in `.hgtags` | ||||
if tag_name and tag_name != b"tip": | if tag_name and tag_name != b"tip": | ||||
snapshot_branches[tag_name] = SnapshotBranch( | snapshot_branches[tag_name] = SnapshotBranch( | ||||
target=self.store_release(tag_name, revision_swhid), | target=self.store_release(tag_name, revision_swhid), | ||||
target_type=TargetType.RELEASE, | target_type=TargetType.RELEASE, | ||||
) | ) | ||||
if hg_nodeid in branch_by_hg_nodeid: | if hg_nodeid in branch_by_hg_nodeid: | ||||
name = branch_by_hg_nodeid[hg_nodeid] | name = branch_by_hg_nodeid[hg_nodeid] | ||||
snapshot_branches[name] = SnapshotBranch( | snapshot_branches[name] = SnapshotBranch( | ||||
target=revision_swhid, target_type=TargetType.REVISION, | target=revision_swhid, target_type=TargetType.REVISION, | ||||
) | ) | ||||
# The tip is mapped to `HEAD` to match | # The tip is mapped to `HEAD` to match | ||||
# the historical implementation | # the historical implementation | ||||
if hg_nodeid == tags_by_name[b"tip"]: | if hg_nodeid == tags_by_name[b"tip"]: | ||||
snapshot_branches[b"HEAD"] = SnapshotBranch( | snapshot_branches[b"HEAD"] = SnapshotBranch( | ||||
target=name, target_type=TargetType.ALIAS, | target=name, target_type=TargetType.ALIAS, | ||||
) | ) | ||||
# TODO: do not write an ExtID if we got this branch from an ExtID that | |||||
# already exists. | |||||
# When we are done migrating away from revision metadata, this will | |||||
# be as simple as checking if the target is in self._latest_heads | |||||
extids.append( | |||||
ExtID( | |||||
extid_type=EXTID_TYPE, | |||||
extid=hg_nodeid, | |||||
target=identifiers.CoreSWHID( | |||||
object_type=identifiers.ObjectType.REVISION, | |||||
object_id=revision_swhid, | |||||
), | |||||
) | |||||
) | |||||
snapshot = Snapshot(branches=snapshot_branches) | snapshot = Snapshot(branches=snapshot_branches) | ||||
self.storage.snapshot_add([snapshot]) | self.storage.snapshot_add([snapshot]) | ||||
self.storage.extid_add(extids) | |||||
self.flush() | self.flush() | ||||
self.loaded_snapshot_id = snapshot.id | self.loaded_snapshot_id = snapshot.id | ||||
def load_status(self) -> Dict[str, str]: | def load_status(self) -> Dict[str, str]: | ||||
"""Detailed loading status. | """Detailed loading status. | ||||
Defaults to logging an eventful load. | Defaults to logging an eventful load. | ||||
▲ Show 20 Lines • Show All 283 Lines • Show Last 20 Lines |
While I'm not expecting a great performance difference, why create an intermediate list instead of a generator since it's the only use, right after?