Differential D5371 Diff 19266 swh/loader/mercurial/from_disk.py

Changeset View

Standalone View

swh/loader/mercurial/from_disk.py

# Copyright (C) 2020-2021 The Software Heritage developers		# Copyright (C) 2020-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

from collections import deque		from collections import deque
from datetime import datetime		from datetime import datetime
import os		import os
from shutil import rmtree		from shutil import rmtree
from tempfile import mkdtemp		from tempfile import mkdtemp
from typing import Deque, Dict, List, Optional, Tuple, TypeVar, Union		from typing import Deque, Dict, List, Optional, Tuple, TypeVar, Union

from swh.loader.core.loader import BaseLoader		from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders		from swh.loader.core.utils import clean_dangling_folders
from swh.loader.mercurial.utils import parse_visit_date		from swh.loader.mercurial.utils import parse_visit_date
		from swh.model import identifiers
from swh.model.from_disk import Content, DentryPerms, Directory		from swh.model.from_disk import Content, DentryPerms, Directory
from swh.model.hashutil import hash_to_bytehex, hash_to_bytes		from swh.model.hashutil import hash_to_bytehex, hash_to_bytes
from swh.model.model import (		from swh.model.model import (
		ExtID,
ObjectType,		ObjectType,
Origin,		Origin,
Person,		Person,
Release,		Release,
Revision,		Revision,
RevisionType,		RevisionType,
Sha1Git,		Sha1Git,
Snapshot,		Snapshot,
Show All 12 Lines
FLAG_PERMS = {		FLAG_PERMS = {
b"l": DentryPerms.symlink,		b"l": DentryPerms.symlink,
b"x": DentryPerms.executable_content,		b"x": DentryPerms.executable_content,
b"": DentryPerms.content,		b"": DentryPerms.content,
} # type: Dict[bytes, DentryPerms]		} # type: Dict[bytes, DentryPerms]

TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk"		TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk"

		EXTID_TYPE = "hg-nodeid"


T = TypeVar("T")		T = TypeVar("T")


class CorruptedRevision(ValueError):		class CorruptedRevision(ValueError):
"""Raised when a revision is corrupted."""		"""Raised when a revision is corrupted."""

def __init__(self, hg_nodeid: HgNodeId) -> None:		def __init__(self, hg_nodeid: HgNodeId) -> None:
▲ Show 20 Lines • Show All 138 Lines • ▼ Show 20 Lines	def prepare(self) -> None:
the loader.		the loader.

"""		"""
# Set here to allow multiple calls to load on the same loader instance		# Set here to allow multiple calls to load on the same loader instance
self._latest_heads = []		self._latest_heads = []

latest_snapshot = snapshot_get_latest(self.storage, self.origin_url)		latest_snapshot = snapshot_get_latest(self.storage, self.origin_url)
if latest_snapshot:		if latest_snapshot:
		self._set_latest_heads(latest_snapshot)

		def _set_latest_heads(self, latest_snapshot: Snapshot) -> None:
		    """Look up the nodeid of every revision in the snapshot and add them
		    to ``self._latest_heads``.

		    This works in two steps:

		    1. Query the revisions with ``extid_get_from_target`` to find nodeids
		       from revision ids, using the new ExtID architecture.
		    2. For all revisions that were not found this way, fetch the revision
		       and look for the nodeid in its metadata.

		    This is a temporary process. When we are done migrating away from
		    revision metadata, step 2 will be removed.

		    Args:
		        latest_snapshot: snapshot whose branches are inspected; only
		            branches whose target type is ``TargetType.REVISION`` are
		            considered.
		    """
		    # TODO: add support for releases
		    # Several branches may target the same revision: deduplicate while
		    # keeping the snapshot order, so each revision is queried once and the
		    # heads found through metadata are appended in a reproducible order.
		    snapshot_revisions = list(
		        dict.fromkeys(
		            branch.target
		            for branch in latest_snapshot.branches.values()
		            if branch.target_type == TargetType.REVISION
		        )
		    )

		    # Get all ExtIDs for revisions in the latest snapshot, keeping only
		    # the ones specific to Mercurial.
		    extids = [
		        extid
		        for extid in self.storage.extid_get_from_target(
		            identifiers.ObjectType.REVISION, snapshot_revisions
		        )
		        if extid.extid_type == EXTID_TYPE
		    ]

		    if extids:
		        # Filter out dangling extids, we need to load their target again.
		        # The result is materialized as a set because it is tested for
		        # membership once per extid: that would be linear on a list and
		        # incorrect on a one-shot iterator.
		        # NOTE(review): the concrete return type of revision_missing is
		        # not visible here -- confirm against the storage interface.
		        revisions_missing = set(
		            self.storage.revision_missing(
		                [extid.target.object_id for extid in extids]
		            )
		        )
		        # Kept as a list rather than a generator: it is iterated twice
		        # below (nodeids, then target ids).
		        extids = [
		            extid
		            for extid in extids
		            if extid.target.object_id not in revisions_missing
		        ]

		    # Add the found nodeids to self._latest_heads
		    self._latest_heads.extend(extid.extid for extid in extids)

		    # For each revision without a nodeid, get the revision metadata
		    # to see if it is found there.
		    found_revisions = {extid.target.object_id for extid in extids}
		    revisions_without_extid = [
		        revision_id
		        for revision_id in snapshot_revisions
		        if revision_id not in found_revisions
		    ]

		    # A revision that is absent, has no metadata, or whose metadata has no
		    # "node" entry contributes no head: it is simply not recorded as
		    # already known, instead of aborting the whole visit with a KeyError.
		    self._latest_heads.extend(
		        hash_to_bytes(revision.metadata["node"])
		        for revision in self.storage.revision_get(revisions_without_extid)
		        if revision and revision.metadata and "node" in revision.metadata
		    )

def fetch_data(self) -> bool:		def fetch_data(self) -> bool:
"""Fetch the data from the source the loader is currently loading		"""Fetch the data from the source the loader is currently loading

Returns:		Returns:
a value that is interpreted as a boolean. If True, fetch_data needs		a value that is interpreted as a boolean. If True, fetch_data needs
to be called again to complete loading.		to be called again to complete loading.

▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	def store_data(self):
}		}
tags_by_name: Dict[bytes, HgNodeId] = repo.tags()		tags_by_name: Dict[bytes, HgNodeId] = repo.tags()
tags_by_hg_nodeid: Dict[HgNodeId, bytes] = {		tags_by_hg_nodeid: Dict[HgNodeId, bytes] = {
hg_nodeid: name for name, hg_nodeid in tags_by_name.items()		hg_nodeid: name for name, hg_nodeid in tags_by_name.items()
}		}

snapshot_branches: Dict[bytes, SnapshotBranch] = {}		snapshot_branches: Dict[bytes, SnapshotBranch] = {}

		extids = []

for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items():		for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items():
tag_name = tags_by_hg_nodeid.get(hg_nodeid)		tag_name = tags_by_hg_nodeid.get(hg_nodeid)

# tip is listed in the tags by the mercurial api		# tip is listed in the tags by the mercurial api
# but its not a tag defined by the user in `.hgtags`		# but its not a tag defined by the user in `.hgtags`
if tag_name and tag_name != b"tip":		if tag_name and tag_name != b"tip":
snapshot_branches[tag_name] = SnapshotBranch(		snapshot_branches[tag_name] = SnapshotBranch(
target=self.store_release(tag_name, revision_swhid),		target=self.store_release(tag_name, revision_swhid),
target_type=TargetType.RELEASE,		target_type=TargetType.RELEASE,
)		)

if hg_nodeid in branch_by_hg_nodeid:		if hg_nodeid in branch_by_hg_nodeid:
name = branch_by_hg_nodeid[hg_nodeid]		name = branch_by_hg_nodeid[hg_nodeid]
snapshot_branches[name] = SnapshotBranch(		snapshot_branches[name] = SnapshotBranch(
target=revision_swhid, target_type=TargetType.REVISION,		target=revision_swhid, target_type=TargetType.REVISION,
)		)

# The tip is mapped to `HEAD` to match		# The tip is mapped to `HEAD` to match
# the historical implementation		# the historical implementation
if hg_nodeid == tags_by_name[b"tip"]:		if hg_nodeid == tags_by_name[b"tip"]:
snapshot_branches[b"HEAD"] = SnapshotBranch(		snapshot_branches[b"HEAD"] = SnapshotBranch(
target=name, target_type=TargetType.ALIAS,		target=name, target_type=TargetType.ALIAS,
)		)

		# TODO: do not write an ExtID if we got this branch from an ExtID that
		# already exists.
		# When we are done migrating away from revision metadata, this will
		# be as simple as checking if the target is in self._latest_heads
		extids.append(
		ExtID(
		extid_type=EXTID_TYPE,
		extid=hg_nodeid,
		target=identifiers.CoreSWHID(
		object_type=identifiers.ObjectType.REVISION,
		object_id=revision_swhid,
		),
		)
		)

snapshot = Snapshot(branches=snapshot_branches)		snapshot = Snapshot(branches=snapshot_branches)
self.storage.snapshot_add([snapshot])		self.storage.snapshot_add([snapshot])

		self.storage.extid_add(extids)

self.flush()		self.flush()
self.loaded_snapshot_id = snapshot.id		self.loaded_snapshot_id = snapshot.id

def load_status(self) -> Dict[str, str]:		def load_status(self) -> Dict[str, str]:
"""Detailed loading status.		"""Detailed loading status.

Defaults to logging an eventful load.		Defaults to logging an eventful load.

▲ Show 20 Lines • Show All 283 Lines • Show Last 20 Lines