Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/cvs/loader.py
# Copyright (C) 2015-2022 The Software Heritage developers | # Copyright (C) 2015-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Loader in charge of injecting either new or existing cvs repositories to | """Loader in charge of injecting either new or existing cvs repositories to | ||||
swh-storage. | swh-storage. | ||||
""" | """ | ||||
from datetime import datetime | from datetime import datetime | ||||
import os | import os | ||||
import os.path | import os.path | ||||
import subprocess | import subprocess | ||||
import tempfile | import tempfile | ||||
import time | import time | ||||
from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple, cast | from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple, cast | ||||
from urllib.parse import urlparse | |||||
import sentry_sdk | import sentry_sdk | ||||
from tenacity import retry | from tenacity import retry | ||||
from tenacity.retry import retry_if_exception_type | from tenacity.retry import retry_if_exception_type | ||||
from tenacity.stop import stop_after_attempt | from tenacity.stop import stop_after_attempt | ||||
from urllib3.util import parse_url | |||||
from swh.loader.core.loader import BaseLoader | from swh.loader.core.loader import BaseLoader | ||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||
from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ( | from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ( | ||||
CHANGESET_FUZZ_SEC, | CHANGESET_FUZZ_SEC, | ||||
ChangeSetKey, | ChangeSetKey, | ||||
CvsConv, | CvsConv, | ||||
FileRevision, | FileRevision, | ||||
▲ Show 20 Lines • Show All 351 Lines • ▼ Show 20 Lines | class CvsLoader(BaseLoader): | ||||
def prepare(self) -> None: | def prepare(self) -> None: | ||||
self._last_revision = None | self._last_revision = None | ||||
self.tempdir_path = tempfile.mkdtemp( | self.tempdir_path = tempfile.mkdtemp( | ||||
suffix="-%s" % os.getpid(), | suffix="-%s" % os.getpid(), | ||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | prefix=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
dir=self.temp_directory, | dir=self.temp_directory, | ||||
) | ) | ||||
url = parse_url(self.origin.url) | url = urlparse(self.origin.url) | ||||
self.log.debug( | self.log.debug( | ||||
"prepare; origin_url=%s scheme=%s path=%s", | "prepare; origin_url=%s scheme=%s path=%s", | ||||
self.origin.url, | self.origin.url, | ||||
url.scheme, | url.scheme, | ||||
url.path, | url.path, | ||||
) | ) | ||||
if not url.path: | if not url.path: | ||||
raise NotFound(f"Invalid CVS origin URL '{self.origin.url}'") | raise NotFound(f"Invalid CVS origin URL '{self.origin.url}'") | ||||
self.cvs_module_name = os.path.basename(url.path) | self.cvs_module_name = os.path.basename(url.path) | ||||
self.server_style_cvsroot = os.path.dirname(url.path) | self.server_style_cvsroot = os.path.dirname(url.path) | ||||
self.worktree_path = os.path.join(self.tempdir_path, self.cvs_module_name) | self.worktree_path = os.path.join(self.tempdir_path, self.cvs_module_name) | ||||
if url.scheme == "file" or url.scheme == "rsync": | if url.scheme == "file" or url.scheme == "rsync": | ||||
# local CVS repository conversion | # local CVS repository conversion | ||||
if not self.cvsroot_path: | if not self.cvsroot_path: | ||||
self.cvsroot_path = tempfile.mkdtemp( | self.cvsroot_path = tempfile.mkdtemp( | ||||
suffix="-%s" % os.getpid(), | suffix="-%s" % os.getpid(), | ||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | prefix=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
dir=self.temp_directory, | dir=self.temp_directory, | ||||
) | ) | ||||
if url.scheme == "file": | if url.scheme == "file": | ||||
if not os.path.exists(url.path): | if not os.path.exists(url.path): | ||||
raise NotFound | raise NotFound | ||||
elif url.scheme == "rsync": | elif url.scheme == "rsync": | ||||
self.fetch_cvs_repo_with_rsync(url.host, url.path) | assert url.hostname is not None | ||||
self.fetch_cvs_repo_with_rsync(url.hostname, url.path) | |||||
have_rcsfile = False | have_rcsfile = False | ||||
have_cvsroot = False | have_cvsroot = False | ||||
for root, dirs, files in os.walk(self.cvsroot_path): | for root, dirs, files in os.walk(self.cvsroot_path): | ||||
if "CVSROOT" in dirs: | if "CVSROOT" in dirs: | ||||
have_cvsroot = True | have_cvsroot = True | ||||
dirs.remove("CVSROOT") | dirs.remove("CVSROOT") | ||||
continue | continue | ||||
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines | def prepare(self) -> None: | ||||
elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | ||||
# remote CVS repository conversion | # remote CVS repository conversion | ||||
if not self.cvsroot_path: | if not self.cvsroot_path: | ||||
self.cvsroot_path = os.path.dirname(url.path) | self.cvsroot_path = os.path.dirname(url.path) | ||||
self.cvsclient = CVSClient(url) | self.cvsclient = CVSClient(url) | ||||
cvsroot_path = os.path.dirname(url.path) | cvsroot_path = os.path.dirname(url.path) | ||||
self.log.debug( | self.log.debug( | ||||
"Fetching CVS rlog from %s:%s/%s", | "Fetching CVS rlog from %s:%s/%s", | ||||
url.host, | url.hostname, | ||||
cvsroot_path, | cvsroot_path, | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
) | ) | ||||
self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | ||||
main_rlog_file = self.cvsclient.fetch_rlog() | main_rlog_file = self.cvsclient.fetch_rlog() | ||||
self.rlog.parse_rlog(main_rlog_file) | self.rlog.parse_rlog(main_rlog_file) | ||||
# Find file deletion events only visible in Attic directories. | # Find file deletion events only visible in Attic directories. | ||||
main_changesets = self.rlog.changesets | main_changesets = self.rlog.changesets | ||||
▲ Show 20 Lines • Show All 156 Lines • Show Last 20 Lines |