Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/mercurial/loader.py
- This file was moved from swh/loader/mercurial/from_disk.py.
# Copyright (C) 2020-2021 The Software Heritage developers | # Copyright (C) 2020-2021 The Software Heritage developers | |||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | |||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | |||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | |||||||||
vlorentzUnsubmitted Not Done Inline Actions
vlorentz: | ||||||||||
"""Loaders for ingesting Mercurial repositories either local from disk, or remote, see | ||||||||||
:class:`swh.loader.mercurial.loader.HgLoader` or from an archive, see | ||||||||||
:class:`swh.loader.mercurial.from_disk.HgArchiveLoader`. | ||||||||||
""" | ||||||||||
from collections import deque | from collections import deque | |||||||||
from datetime import datetime | from datetime import datetime | |||||||||
import os | import os | |||||||||
from shutil import rmtree | from shutil import rmtree | |||||||||
from tempfile import mkdtemp | from tempfile import mkdtemp | |||||||||
from typing import Deque, Dict, Iterator, List, Optional, Set, Tuple, TypeVar, Union | from typing import Deque, Dict, Iterator, List, Optional, Set, Tuple, TypeVar, Union | |||||||||
from swh.core.utils import grouper | from swh.core.utils import grouper | |||||||||
from swh.loader.core.loader import BaseLoader | from swh.loader.core.loader import BaseLoader | |||||||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | |||||||||
from swh.loader.mercurial.utils import get_minimum_env, parse_visit_date | from swh.loader.mercurial.utils import get_minimum_env | |||||||||
from swh.model import identifiers | from swh.model import identifiers | |||||||||
from swh.model.from_disk import Content, DentryPerms, Directory | from swh.model.from_disk import Content, DentryPerms, Directory | |||||||||
from swh.model.hashutil import hash_to_bytehex | from swh.model.hashutil import hash_to_bytehex | |||||||||
from swh.model.model import ( | from swh.model.model import ( | |||||||||
ExtID, | ExtID, | |||||||||
ObjectType, | ObjectType, | |||||||||
Origin, | Origin, | |||||||||
Person, | Person, | |||||||||
Show All 15 Lines | ||||||||||
from .hgutil import HgFilteredSet, HgNodeId, HgSpanSet | from .hgutil import HgFilteredSet, HgNodeId, HgSpanSet | |||||||||
FLAG_PERMS = { | FLAG_PERMS = { | |||||||||
b"l": DentryPerms.symlink, | b"l": DentryPerms.symlink, | |||||||||
b"x": DentryPerms.executable_content, | b"x": DentryPerms.executable_content, | |||||||||
b"": DentryPerms.content, | b"": DentryPerms.content, | |||||||||
} # type: Dict[bytes, DentryPerms] | } # type: Dict[bytes, DentryPerms] | |||||||||
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" | TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.loader" | |||||||||
EXTID_TYPE = "hg-nodeid" | EXTID_TYPE = "hg-nodeid" | |||||||||
EXTID_VERSION: int = 1 | EXTID_VERSION: int = 1 | |||||||||
T = TypeVar("T") | T = TypeVar("T") | |||||||||
class CorruptedRevision(ValueError): | class CorruptedRevision(ValueError): | |||||||||
Show All 39 Lines | class HgDirectory(Directory): | |||||||||
) -> Optional[Union[Content, "HgDirectory", T]]: | ) -> Optional[Union[Content, "HgDirectory", T]]: | |||||||||
# TODO move to swh.model.from_disk.Directory | # TODO move to swh.model.from_disk.Directory | |||||||||
try: | try: | |||||||||
return self[path] | return self[path] | |||||||||
except KeyError: | except KeyError: | |||||||||
return default | return default | |||||||||
class HgLoaderFromDisk(BaseLoader): | class HgLoader(BaseLoader): | |||||||||
"""Load a mercurial repository from a local repository. | """Load a mercurial repository from a local repository. | |||||||||
Mercurial's branching model is more complete than Git's; it allows for multiple | Mercurial's branching model is more complete than Git's; it allows for multiple | |||||||||
heads per branch, closed heads and bookmarks. The following mapping is used to | heads per branch, closed heads and bookmarks. The following mapping is used to | |||||||||
represent the branching state of a Mercurial project in a given snapshot: | represent the branching state of a Mercurial project in a given snapshot: | |||||||||
- `HEAD` (optional) either the node pointed by the `@` bookmark or the tip of | - `HEAD` (optional) either the node pointed by the `@` bookmark or the tip of | |||||||||
the `default` branch | the `default` branch | |||||||||
Show All 21 Lines | class HgLoader(BaseLoader): | |||||||||
visit_type = "hg" | visit_type = "hg" | |||||||||
def __init__( | def __init__( | |||||||||
self, | self, | |||||||||
storage: StorageInterface, | storage: StorageInterface, | |||||||||
url: str, | url: str, | |||||||||
directory: Optional[str] = None, | directory: Optional[str] = None, | |||||||||
logging_class: str = "swh.loader.mercurial.LoaderFromDisk", | logging_class: str = "swh.loader.mercurial.loader.HgLoader", | |||||||||
visit_date: Optional[datetime] = None, | visit_date: Optional[datetime] = None, | |||||||||
temp_directory: str = "/tmp", | temp_directory: str = "/tmp", | |||||||||
clone_timeout_seconds: int = 7200, | clone_timeout_seconds: int = 7200, | |||||||||
content_cache_size: int = 10_000, | content_cache_size: int = 10_000, | |||||||||
max_content_size: Optional[int] = None, | max_content_size: Optional[int] = None, | |||||||||
): | ): | |||||||||
"""Initialize the loader. | """Initialize the loader. | |||||||||
▲ Show 20 Lines • Show All 591 Lines • ▼ Show 20 Lines | def store_directories(self, rev_ctx: hgutil.BaseContext) -> Sha1Git: | |||||||||
self.storage.directory_add([directory.to_model()]) | self.storage.directory_add([directory.to_model()]) | |||||||||
directories.extend( | directories.extend( | |||||||||
[item for item in directory.values() if isinstance(item, Directory)] | [item for item in directory.values() if isinstance(item, Directory)] | |||||||||
) | ) | |||||||||
return self._last_root.hash | return self._last_root.hash | |||||||||
class HgArchiveLoader(HgLoader):
    """Mercurial loader for repository wrapped within tarballs.

    Instead of cloning, :meth:`prepare` extracts the archive into a temporary
    directory and points ``self.directory`` at the first entry found there
    before delegating to the parent loader.
    """

    def __init__(
        self,
        storage: StorageInterface,
        url: str,
        visit_date: Optional[datetime] = None,
        archive_path: Optional[str] = None,
        temp_directory: str = "/tmp",
        max_content_size: Optional[int] = None,
    ):
        """Initialize the loader.

        Args:
            storage: forwarded as-is to the parent loader.
            url: forwarded as-is to the parent loader.
            visit_date: forwarded as-is to the parent loader.
            archive_path: path of the archive handed to ``tmp_extract`` in
                :meth:`prepare`.
            temp_directory: forwarded as-is to the parent loader.
            max_content_size: forwarded as-is to the parent loader.
        """
        super().__init__(
            storage=storage,
            url=url,
            visit_date=visit_date,
            logging_class="swh.loader.mercurial.loader.ArchiveLoader",
            temp_directory=temp_directory,
            max_content_size=max_content_size,
        )
        # Filled in by prepare() once the archive has been extracted.
        self.archive_extract_temp_dir = None
        self.archive_path = archive_path

    def prepare(self):
        """Extract the archive instead of cloning."""
        extract_dir = tmp_extract(
            archive=self.archive_path,
            dir=self._temp_directory,
            prefix=TEMPORARY_DIR_PREFIX_PATTERN,
            suffix=f".dump-{os.getpid()}",
            log=self.log,
            source=self.origin_url,
        )
        self.archive_extract_temp_dir = extract_dir

        # List the directory the archive was just extracted into: the entry
        # name is joined onto that same directory below, so reading it from
        # any other location would yield a path that may not exist.
        # NOTE(review): only the first listed entry is used; presumably the
        # archive holds a single top-level repository directory -- confirm.
        repo_name = os.listdir(extract_dir)[0]
        self.directory = os.path.join(extract_dir, repo_name)
        super().prepare()
# Allow direct usage of the loader from the command line with | ||||||||||
# `python -m swh.loader.mercurial.from_disk $ORIGIN_URL` | ||||||||||
if __name__ == "__main__": | ||||||||||
import logging | ||||||||||
import click | ||||||||||
logging.basicConfig( | ||||||||||
level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s" | ||||||||||
) | ||||||||||
@click.command() | ||||||||||
@click.option("--origin-url", help="origin url") | ||||||||||
@click.option("--hg-directory", help="Path to mercurial repository to load") | ||||||||||
@click.option("--visit-date", default=None, help="Visit date") | ||||||||||
def main(origin_url, hg_directory, visit_date): | ||||||||||
from swh.storage import get_storage | ||||||||||
storage = get_storage(cls="memory") | ||||||||||
return HgLoaderFromDisk( | ||||||||||
storage, | ||||||||||
origin_url, | ||||||||||
directory=hg_directory, | ||||||||||
visit_date=parse_visit_date(visit_date), | ||||||||||
).load() | ||||||||||
main() |