Changeset View
Standalone View
swh/loader/cvs/loader.py
# Copyright (C) 2015-2021 The Software Heritage developers | # Copyright (C) 2015-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Loader in charge of injecting either new or existing cvs repositories to | """Loader in charge of injecting either new or existing cvs repositories to | ||||
swh-storage. | swh-storage. | ||||
""" | """ | ||||
from datetime import datetime | from datetime import datetime | ||||
import os | import os | ||||
import os.path | |||||
import subprocess | import subprocess | ||||
import tempfile | import tempfile | ||||
import time | import time | ||||
from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple | from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple, cast | ||||
from urllib3.util import parse_url | from urllib3.util import parse_url | ||||
from swh.loader.core.loader import BaseLoader | from swh.loader.core.loader import BaseLoader | ||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||
from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ( | from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ( | ||||
CHANGESET_FUZZ_SEC, | CHANGESET_FUZZ_SEC, | ||||
ChangeSetKey, | ChangeSetKey, | ||||
▲ Show 20 Lines • Show All 77 Lines • ▼ Show 20 Lines | ): | ||||
self._contents: List[Content] = [] | self._contents: List[Content] = [] | ||||
self._skipped_contents: List[SkippedContent] = [] | self._skipped_contents: List[SkippedContent] = [] | ||||
self._directories: List[Directory] = [] | self._directories: List[Directory] = [] | ||||
self._revisions: List[Revision] = [] | self._revisions: List[Revision] = [] | ||||
# internal state, current visit | # internal state, current visit | ||||
self._last_revision: Optional[Revision] = None | self._last_revision: Optional[Revision] = None | ||||
self._visit_status = "full" | self._visit_status = "full" | ||||
self.visit_date = visit_date | self.visit_date = visit_date | ||||
if not cvsroot_path: | |||||
cvsroot_path = tempfile.mkdtemp( | |||||
suffix="-%s" % os.getpid(), | |||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | |||||
dir=self.temp_directory, | |||||
) | |||||
vlorentz: why move this out of the constructor?
vlorentz: ping?
Done Inline ActionsThe cvsroot path has different semantics based on the access protocol. With rsync it is indeed a temporary directory where a full copy of the cvs repository is located within. Support for rsync was implemented first (without giving much thought to the pserver case) so this variable ended up in the constructor. But in the pserver case we need to initialize this to something else. The cvsroot is just a path on the server in that case, and there is no local copy of the repository. In order to construct paths into the Attic directories correctly, we need to use the server-side cvsroot path, not a path to a local directory somewhere in /tmp. And we should skip the creation of an unused temporary directory in the pserver case. Does this make sense now? stsp: The cvsroot path has different semantics based on the access protocol.
With rsync it is indeed… | |||||
vlorentz: I see. Then, instead of `cast(str, self.cvsroot_path)`, which converts *any* type to a string, thereby bypassing type-checking, you should just use `assert self.cvsroot_path`, which causes mypy to infer the type is `str` instead of `Optional[str]`.
Done Inline ActionsThanks, done. stsp: Thanks, done. | |||||
self.cvsroot_path = cvsroot_path | self.cvsroot_path = cvsroot_path | ||||
self.snapshot: Optional[Snapshot] = None | self.snapshot: Optional[Snapshot] = None | ||||
self.last_snapshot: Optional[Snapshot] = snapshot_get_latest( | self.last_snapshot: Optional[Snapshot] = snapshot_get_latest( | ||||
self.storage, self.origin_url | self.storage, self.origin_url | ||||
) | ) | ||||
def compute_swh_revision( | def compute_swh_revision( | ||||
Show All 17 Lines | ) -> Tuple[Revision, from_disk.Directory]: | ||||
revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents) | revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents) | ||||
self.log.debug("SWH revision ID: %s", hashutil.hash_to_hex(revision.id)) | self.log.debug("SWH revision ID: %s", hashutil.hash_to_hex(revision.id)) | ||||
self._last_revision = revision | self._last_revision = revision | ||||
return (revision, swh_dir) | return (revision, swh_dir) | ||||
def checkout_file_with_rcsparse(
    self, k: ChangeSetKey, f: FileRevision, rcsfile: rcsparse.rcsfile
) -> None:
    """Materialize one file revision in the work tree using rcsparse.

    Dead revisions are removed from the work tree; live revisions are
    checked out with RCS keywords expanded.

    Args:
        k: changeset key this file revision belongs to (kept for
           signature parity with ``checkout_file_with_cvsclient``)
        f: the file revision to process (``path``, ``rev``, ``state``)
        rcsfile: a pre-opened rcsparse file, or a falsy value to have
           one opened from ``f.path``
    """
    # cvsroot_path is Optional[str] at construction time, but it is
    # always initialized before checkouts run; assert narrows the type
    # for mypy instead of an unchecked cast(str, ...).
    assert self.cvsroot_path
    path = file_path(self.cvsroot_path, f.path)
    wtpath = os.path.join(self.worktree_path, path)
    # lazy %-args: message is only formatted when INFO is enabled
    self.log.info("rev %s state %s file %s", f.rev, f.state, f.path)
    if f.state == "dead":
        # remove this file from work tree
        try:
            os.remove(wtpath)
        except FileNotFoundError:
            pass
    else:
        # create, or update, this file in the work tree
        if not rcsfile:
            rcsfile = rcsparse.rcsfile(f.path)
        rcs = RcsKeywords()
        contents = rcs.expand_keyword(f.path, rcsfile, f.rev)
        os.makedirs(os.path.dirname(wtpath), exist_ok=True)
        # context manager guarantees the handle is closed even if the
        # write raises
        with open(wtpath, mode="wb") as outfile:
            outfile.write(contents)
def checkout_file_with_cvsclient(
    self, k: ChangeSetKey, f: FileRevision, cvsclient: CVSClient
) -> None:
    """Materialize one file revision in the work tree via the CVS client.

    Dead revisions are removed from the work tree; live revisions are
    checked out (with keyword expansion) into a temporary file which is
    then renamed into place.

    Args:
        k: changeset key this file revision belongs to (kept for
           signature parity with ``checkout_file_with_rcsparse``)
        f: the file revision to process (``path``, ``rev``, ``state``)
        cvsclient: connected CVS client used to perform the checkout
    """
    # assert narrows Optional[str] to str for mypy; cvsroot_path is
    # always initialized before checkouts run (see review discussion —
    # preferred over an unchecked cast(str, ...)).
    assert self.cvsroot_path
    path = file_path(self.cvsroot_path, f.path)
    wtpath = os.path.join(self.worktree_path, path)
    # lazy %-args: message is only formatted when the level is enabled
    self.log.info("rev %s state %s file %s", f.rev, f.state, f.path)
    if f.state == "dead":
        # remove this file from work tree
        try:
            os.remove(wtpath)
        except FileNotFoundError:
            pass
    else:
        dirname = os.path.dirname(wtpath)
        os.makedirs(dirname, exist_ok=True)
        self.log.debug("checkout to %s\n", wtpath)
        # pass the repository-relative path computed above (not the raw
        # rlog path) so Attic files resolve correctly
        fp = cvsclient.checkout(path, f.rev, dirname, expand_keywords=True)
        os.rename(fp.name, wtpath)
        try:
            fp.close()
        except FileNotFoundError:
            # Well, we have just renamed the file...
            pass
def process_cvs_changesets( | def process_cvs_changesets( | ||||
▲ Show 20 Lines • Show All 77 Lines • ▼ Show 20 Lines | def fetch_cvs_repo_with_rsync(self, host: str, path: str) -> None: | ||||
break | break | ||||
if not have_module: | if not have_module: | ||||
raise NotFound( | raise NotFound( | ||||
"CVS module %s not found at %s" % (self.cvs_module_name, url) | "CVS module %s not found at %s" % (self.cvs_module_name, url) | ||||
) | ) | ||||
if not have_cvsroot: | if not have_cvsroot: | ||||
raise NotFound("No CVSROOT directory found at %s" % url) | raise NotFound("No CVSROOT directory found at %s" % url) | ||||
subprocess.run(["rsync", "-a", url, self.cvsroot_path]).check_returncode() | # mypy complains: List item 3 has incompatible type "Optional[str]"; | ||||
# because self.cvsroot_path is an optional argument. We do however | |||||
# ensure that it is initialized if the loader is not passed a | |||||
Not Done Inline ActionsI don't think telling mypy to ignore the type error is the right solution (see my other comment) vlorentz: I don't think telling mypy to ignore the type error is the right solution (see my other comment) | |||||
Done Inline ActionsAgreed. Thanks for taking the time to help me figure out what needs to be done to get it work right. stsp: Agreed. Thanks for taking the time to help me figure out what needs to be done to get it work… | |||||
# corresponding argument. Better ideas than ignoring types on this line? | |||||
subprocess.run( | |||||
["rsync", "-a", url, self.cvsroot_path] # type: ignore | |||||
).check_returncode() | |||||
def prepare(self) -> None: | def prepare(self) -> None: | ||||
self._last_revision = None | self._last_revision = None | ||||
self.worktree_path = tempfile.mkdtemp( | self.worktree_path = tempfile.mkdtemp( | ||||
suffix="-%s" % os.getpid(), | suffix="-%s" % os.getpid(), | ||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | prefix=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
dir=self.temp_directory, | dir=self.temp_directory, | ||||
) | ) | ||||
url = parse_url(self.origin_url) | url = parse_url(self.origin_url) | ||||
self.log.debug( | self.log.debug( | ||||
"prepare; origin_url=%s scheme=%s path=%s", | "prepare; origin_url=%s scheme=%s path=%s", | ||||
self.origin_url, | self.origin_url, | ||||
url.scheme, | url.scheme, | ||||
url.path, | url.path, | ||||
) | ) | ||||
if not url.path: | if not url.path: | ||||
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | ||||
self.cvs_module_name = os.path.basename(url.path) | self.cvs_module_name = os.path.basename(url.path) | ||||
os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name)) | os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name)) | ||||
if url.scheme == "file" or url.scheme == "rsync": | |||||
# local CVS repository conversion | |||||
if not self.cvsroot_path: | |||||
self.cvsroot_path = tempfile.mkdtemp( | |||||
suffix="-%s" % os.getpid(), | |||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | |||||
dir=self.temp_directory, | |||||
) | |||||
if url.scheme == "file": | if url.scheme == "file": | ||||
if not os.path.exists(url.path): | if not os.path.exists(url.path): | ||||
raise NotFound | raise NotFound | ||||
elif url.scheme == "rsync": | elif url.scheme == "rsync": | ||||
self.fetch_cvs_repo_with_rsync(url.host, url.path) | self.fetch_cvs_repo_with_rsync(url.host, url.path) | ||||
if url.scheme == "file" or url.scheme == "rsync": | |||||
# local CVS repository conversion | |||||
have_rcsfile = False | have_rcsfile = False | ||||
have_cvsroot = False | have_cvsroot = False | ||||
for root, dirs, files in os.walk(self.cvsroot_path): | for root, dirs, files in os.walk(self.cvsroot_path): | ||||
if "CVSROOT" in dirs: | if "CVSROOT" in dirs: | ||||
have_cvsroot = True | have_cvsroot = True | ||||
dirs.remove("CVSROOT") | dirs.remove("CVSROOT") | ||||
continue | continue | ||||
for f in files: | for f in files: | ||||
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines | def prepare(self) -> None: | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
len(cvs_changesets), | len(cvs_changesets), | ||||
) | ) | ||||
self.swh_revision_gen = self.process_cvs_changesets( | self.swh_revision_gen = self.process_cvs_changesets( | ||||
cvs_changesets, use_rcsparse=True | cvs_changesets, use_rcsparse=True | ||||
) | ) | ||||
elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | ||||
# remote CVS repository conversion | # remote CVS repository conversion | ||||
if not self.cvsroot_path: | |||||
self.cvsroot_path = os.path.dirname(url.path) | |||||
self.cvsclient = CVSClient(url) | self.cvsclient = CVSClient(url) | ||||
cvsroot_path = os.path.dirname(url.path) | cvsroot_path = os.path.dirname(url.path) | ||||
self.log.info( | self.log.info( | ||||
"Fetching CVS rlog from %s:%s/%s", | "Fetching CVS rlog from %s:%s/%s", | ||||
url.host, | url.host, | ||||
cvsroot_path, | cvsroot_path, | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
) | ) | ||||
self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | ||||
self.rlog_file = self.cvsclient.fetch_rlog() | main_rlog_file = self.cvsclient.fetch_rlog() | ||||
self.rlog.parse_rlog(self.rlog_file) | self.rlog.parse_rlog(main_rlog_file) | ||||
# Find file deletion events only visible in Attic directories. | |||||
main_changesets = self.rlog.changesets | |||||
attic_paths = [] | |||||
attic_rlog_files = [] | |||||
for k in main_changesets: | |||||
vlorentz: can you use longer variable names?
for changed_file in k.revs: | |||||
path = file_path(cast(str, self.cvsroot_path), changed_file.path) | |||||
Not Done Inline Actionshard to tell, there are many implicitly typed variables here. You can use reveal_type(xxx) to make mypy print what type it inferred for xxx (no need to import it, it's only used while type-checking) vlorentz: hard to tell, there are many implicitly typed variables here.
You can use `reveal_type(xxx)`… | |||||
Not Done Inline Actionstry rebasing on top of D6598, I added a bunch of type annotations that should help mypy vlorentz: try rebasing on top of D6598, I added a bunch of type annotations that should help mypy | |||||
Done Inline ActionsThanks, I have tried this by running 'arc patch D6598' and creating a temporary branch based on the arcpatch-D6598 branch. Then I used git cherry-pick to apply my oustanding patches in sequence. The result is that mypy still complains as follows: swh/loader/cvs/loader.py:143: error: Argument 1 to "file_path" has incompatible type "Optional[str]"; expected "str" I know how to fix the one on swh/loader/cvs/loader.py:222 (declare rlog_file member as BinaryIO) but the rest elude me... it seems like a tempfile.TemporaryFile() object cannot be used with anything that requires BinaryIO, is it? stsp: Thanks, I have tried this by running 'arc patch D6598' and creating a temporary branch based on… | |||||
Not Done Inline ActionsIt can, depending on the value of the parameter (t vs b). Unfortunately, mypy does not support dependent types, so it can't know that. Use https://docs.python.org/3/library/typing.html#typing.cast vlorentz: It can, depending on the value of the parameter (`t` vs `b`).
Unfortunately, mypy does not… | |||||
Done Inline ActionsFine, I could fix all these warnings with explicit casts. stsp: Fine, I could fix all these warnings with explicit casts. | |||||
if path.startswith(cast(str, self.cvsroot_path)): | |||||
path = path[ | |||||
len( | |||||
os.path.commonpath([cast(str, self.cvsroot_path), path]) | |||||
) | |||||
+ 1 : | |||||
] | |||||
parent_path = os.path.dirname(path) | |||||
if parent_path.split("/")[-1] == "Attic": | |||||
continue | |||||
attic_path = parent_path + "/Attic" | |||||
if attic_path in attic_paths: | |||||
continue | |||||
attic_paths.append(attic_path) # avoid multiple visits | |||||
# Try to fetch more rlog data from this Attic directory. | |||||
attic_rlog_file = self.cvsclient.fetch_rlog( | |||||
path=attic_path, state="dead", | |||||
) | |||||
if attic_rlog_file: | |||||
attic_rlog_files.append(attic_rlog_file) | |||||
if len(attic_rlog_files) == 0: | |||||
self.rlog_file = main_rlog_file | |||||
else: | |||||
# Combine all the rlog pieces we found and re-parse. | |||||
fp = tempfile.TemporaryFile() | |||||
for attic_rlog_file in attic_rlog_files: | |||||
for line in attic_rlog_file.readlines(): | |||||
fp.write(line) | |||||
attic_rlog_file.close() | |||||
main_rlog_file.seek(0) | |||||
for line in main_rlog_file.readlines(): | |||||
fp.write(line) | |||||
main_rlog_file.close() | |||||
fp.seek(0) | |||||
self.rlog.parse_rlog(cast(BinaryIO, fp)) | |||||
self.rlog_file = cast(BinaryIO, fp) | |||||
cvs_changesets = sorted(self.rlog.changesets) | cvs_changesets = sorted(self.rlog.changesets) | ||||
self.log.info( | self.log.info( | ||||
"CVS changesets found for %s: %d", | "CVS changesets found for %s: %d", | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
len(cvs_changesets), | len(cvs_changesets), | ||||
) | ) | ||||
self.swh_revision_gen = self.process_cvs_changesets( | self.swh_revision_gen = self.process_cvs_changesets( | ||||
cvs_changesets, use_rcsparse=False | cvs_changesets, use_rcsparse=False | ||||
▲ Show 20 Lines • Show All 101 Lines • Show Last 20 Lines |
why move this out of the constructor?