Changeset View
Standalone View
swh/loader/cvs/loader.py
# Copyright (C) 2015-2021 The Software Heritage developers | # Copyright (C) 2015-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Loader in charge of injecting either new or existing cvs repositories to | """Loader in charge of injecting either new or existing cvs repositories to | ||||
swh-storage. | swh-storage. | ||||
""" | """ | ||||
from datetime import datetime | from datetime import datetime | ||||
import os | import os | ||||
import os.path | |||||
import subprocess | import subprocess | ||||
import tempfile | import tempfile | ||||
import time | import time | ||||
from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple | from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple, cast | ||||
from urllib3.util import parse_url | from urllib3.util import parse_url | ||||
from swh.loader.core.loader import BaseLoader | from swh.loader.core.loader import BaseLoader | ||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||
from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ( | from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ( | ||||
CHANGESET_FUZZ_SEC, | CHANGESET_FUZZ_SEC, | ||||
ChangeSetKey, | ChangeSetKey, | ||||
▲ Show 20 Lines • Show All 77 Lines • ▼ Show 20 Lines | ): | ||||
self._contents: List[Content] = [] | self._contents: List[Content] = [] | ||||
self._skipped_contents: List[SkippedContent] = [] | self._skipped_contents: List[SkippedContent] = [] | ||||
self._directories: List[Directory] = [] | self._directories: List[Directory] = [] | ||||
self._revisions: List[Revision] = [] | self._revisions: List[Revision] = [] | ||||
# internal state, current visit | # internal state, current visit | ||||
self._last_revision: Optional[Revision] = None | self._last_revision: Optional[Revision] = None | ||||
self._visit_status = "full" | self._visit_status = "full" | ||||
self.visit_date = visit_date | self.visit_date = visit_date | ||||
if not cvsroot_path: | |||||
cvsroot_path = tempfile.mkdtemp( | |||||
suffix="-%s" % os.getpid(), | |||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | |||||
dir=self.temp_directory, | |||||
) | |||||
vlorentz: why move this out of the constructor?
vlorentz: ping?
Done Inline ActionsThe cvsroot path has different semantics based on the access protocol. With rsync it is indeed a temporary directory where a full copy of the cvs repository is located within. Support for rsync was implemented first (without giving much thought to the pserver case) so this variable ended up in the constructor. But in the pserver case we need to initialize this to something else. The cvsroot is just a path on the server in that case, and there is no local copy of the repository. In order to construct paths into the Attic directories correctly, we need to use the server-side cvsroot path, not a path to a local directory somewhere in /tmp. And we should skip the creation of an unused temporary directory in the pserver case. Does this make sense now? stsp: The cvsroot path has different semantics based on the access protocol.
With rsync it is indeed… | |||||
vlorentz: I see. Then, instead of `cast(str, self.cvsroot_path)`, which converts *any* type to a string, thereby bypassing type-checking, you should just use `assert self.cvsroot_path`, which causes mypy to infer the type is `str` instead of `Optional[str]`.
Done Inline ActionsThanks, done. stsp: Thanks, done. | |||||
self.cvsroot_path = cvsroot_path | self.cvsroot_path = cvsroot_path | ||||
self.snapshot: Optional[Snapshot] = None | self.snapshot: Optional[Snapshot] = None | ||||
self.last_snapshot: Optional[Snapshot] = snapshot_get_latest( | self.last_snapshot: Optional[Snapshot] = snapshot_get_latest( | ||||
self.storage, self.origin_url | self.storage, self.origin_url | ||||
) | ) | ||||
def compute_swh_revision( | def compute_swh_revision( | ||||
Show All 17 Lines | ) -> Tuple[Revision, from_disk.Directory]: | ||||
revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents) | revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents) | ||||
self.log.debug("SWH revision ID: %s", hashutil.hash_to_hex(revision.id)) | self.log.debug("SWH revision ID: %s", hashutil.hash_to_hex(revision.id)) | ||||
self._last_revision = revision | self._last_revision = revision | ||||
return (revision, swh_dir) | return (revision, swh_dir) | ||||
def checkout_file_with_rcsparse(
    self, k: ChangeSetKey, f: FileRevision, rcsfile: rcsparse.rcsfile
) -> None:
    """Materialize one file revision in the work tree using rcsparse.

    Dead revisions are removed from the work tree; live revisions are
    checked out with RCS keywords expanded.

    Args:
        k: changeset key this file revision belongs to (kept for
           signature parity with ``checkout_file_with_cvsclient``)
        f: the file revision to process (``path``, ``rev``, ``state``)
        rcsfile: a pre-opened rcsparse file, or a falsy value to have
           one opened from ``f.path``
    """
    # cvsroot_path is Optional[str] at construction time, but it is
    # always initialized before checkouts run; assert narrows the type
    # for mypy instead of an unchecked cast(str, ...).
    assert self.cvsroot_path
    path = file_path(self.cvsroot_path, f.path)
    wtpath = os.path.join(self.worktree_path, path)
    # lazy %-args: message is only formatted when INFO is enabled
    self.log.info("rev %s state %s file %s", f.rev, f.state, f.path)
    if f.state == "dead":
        # remove this file from work tree
        try:
            os.remove(wtpath)
        except FileNotFoundError:
            pass
    else:
        # create, or update, this file in the work tree
        if not rcsfile:
            rcsfile = rcsparse.rcsfile(f.path)
        rcs = RcsKeywords()
        contents = rcs.expand_keyword(f.path, rcsfile, f.rev)
        os.makedirs(os.path.dirname(wtpath), exist_ok=True)
        # context manager guarantees the handle is closed even if the
        # write raises
        with open(wtpath, mode="wb") as outfile:
            outfile.write(contents)
def checkout_file_with_cvsclient(
    self, k: ChangeSetKey, f: FileRevision, cvsclient: CVSClient
) -> None:
    """Materialize one file revision in the work tree via the CVS client.

    Dead revisions are removed from the work tree; live revisions are
    checked out (with keyword expansion) into a temporary file which is
    then renamed into place.

    Args:
        k: changeset key this file revision belongs to (kept for
           signature parity with ``checkout_file_with_rcsparse``)
        f: the file revision to process (``path``, ``rev``, ``state``)
        cvsclient: connected CVS client used to perform the checkout
    """
    # assert narrows Optional[str] to str for mypy; cvsroot_path is
    # always initialized before checkouts run (see review discussion —
    # preferred over an unchecked cast(str, ...)).
    assert self.cvsroot_path
    path = file_path(self.cvsroot_path, f.path)
    wtpath = os.path.join(self.worktree_path, path)
    # lazy %-args: message is only formatted when the level is enabled
    self.log.info("rev %s state %s file %s", f.rev, f.state, f.path)
    if f.state == "dead":
        # remove this file from work tree
        try:
            os.remove(wtpath)
        except FileNotFoundError:
            pass
    else:
        dirname = os.path.dirname(wtpath)
        os.makedirs(dirname, exist_ok=True)
        self.log.debug("checkout to %s\n", wtpath)
        # pass the repository-relative path computed above (not the raw
        # rlog path) so Attic files resolve correctly
        fp = cvsclient.checkout(path, f.rev, dirname, expand_keywords=True)
        os.rename(fp.name, wtpath)
        try:
            fp.close()
        except FileNotFoundError:
            # Well, we have just renamed the file...
            pass
def process_cvs_changesets( | def process_cvs_changesets( | ||||
▲ Show 20 Lines • Show All 77 Lines • ▼ Show 20 Lines | def fetch_cvs_repo_with_rsync(self, host: str, path: str) -> None: | ||||
break | break | ||||
if not have_module: | if not have_module: | ||||
raise NotFound( | raise NotFound( | ||||
"CVS module %s not found at %s" % (self.cvs_module_name, url) | "CVS module %s not found at %s" % (self.cvs_module_name, url) | ||||
) | ) | ||||
if not have_cvsroot: | if not have_cvsroot: | ||||
raise NotFound("No CVSROOT directory found at %s" % url) | raise NotFound("No CVSROOT directory found at %s" % url) | ||||
subprocess.run(["rsync", "-a", url, self.cvsroot_path]).check_returncode() | # mypy complains: List item 3 has incompatible type "Optional[str]"; | ||||
# because self.cvsroot_path is an optional argument. We do however | |||||
# ensure that it is initialized if the loader is not passed a | |||||
Not Done Inline ActionsI don't think telling mypy to ignore the type error is the right solution (see my other comment) vlorentz: I don't think telling mypy to ignore the type error is the right solution (see my other comment) | |||||
Done Inline ActionsAgreed. Thanks for taking the time to help me figure out what needs to be done to get it work right. stsp: Agreed. Thanks for taking the time to help me figure out what needs to be done to get it work… | |||||
# corresponding argument. Better ideas than ignoring types on this line? | |||||
subprocess.run( | |||||
["rsync", "-a", url, self.cvsroot_path] # type: ignore | |||||
).check_returncode() | |||||
def prepare(self) -> None: | def prepare(self) -> None: | ||||
self._last_revision = None | self._last_revision = None | ||||
self.worktree_path = tempfile.mkdtemp( | self.worktree_path = tempfile.mkdtemp( | ||||
suffix="-%s" % os.getpid(), | suffix="-%s" % os.getpid(), | ||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | prefix=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
dir=self.temp_directory, | dir=self.temp_directory, | ||||
) | ) | ||||
url = parse_url(self.origin_url) | url = parse_url(self.origin_url) | ||||
self.log.debug( | self.log.debug( | ||||
"prepare; origin_url=%s scheme=%s path=%s", | "prepare; origin_url=%s scheme=%s path=%s", | ||||
self.origin_url, | self.origin_url, | ||||
url.scheme, | url.scheme, | ||||
url.path, | url.path, | ||||
) | ) | ||||
if not url.path: | if not url.path: | ||||
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | ||||
self.cvs_module_name = os.path.basename(url.path) | self.cvs_module_name = os.path.basename(url.path) | ||||
os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name)) | os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name)) | ||||
if url.scheme == "file" or url.scheme == "rsync": | |||||
# local CVS repository conversion | |||||
if not self.cvsroot_path: | |||||
self.cvsroot_path = tempfile.mkdtemp( | |||||
suffix="-%s" % os.getpid(), | |||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | |||||
dir=self.temp_directory, | |||||
) | |||||
if url.scheme == "file": | if url.scheme == "file": | ||||
if not os.path.exists(url.path): | if not os.path.exists(url.path): | ||||
raise NotFound | raise NotFound | ||||
elif url.scheme == "rsync": | elif url.scheme == "rsync": | ||||
self.fetch_cvs_repo_with_rsync(url.host, url.path) | self.fetch_cvs_repo_with_rsync(url.host, url.path) | ||||
if url.scheme == "file" or url.scheme == "rsync": | |||||
# local CVS repository conversion | |||||
have_rcsfile = False | have_rcsfile = False | ||||
have_cvsroot = False | have_cvsroot = False | ||||
for root, dirs, files in os.walk(self.cvsroot_path): | for root, dirs, files in os.walk(self.cvsroot_path): | ||||
if "CVSROOT" in dirs: | if "CVSROOT" in dirs: | ||||
have_cvsroot = True | have_cvsroot = True | ||||
dirs.remove("CVSROOT") | dirs.remove("CVSROOT") | ||||
continue | continue | ||||
for f in files: | for f in files: | ||||
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines | def prepare(self) -> None: | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
len(cvs_changesets), | len(cvs_changesets), | ||||
) | ) | ||||
self.swh_revision_gen = self.process_cvs_changesets( | self.swh_revision_gen = self.process_cvs_changesets( | ||||
cvs_changesets, use_rcsparse=True | cvs_changesets, use_rcsparse=True | ||||
) | ) | ||||
elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | ||||
# remote CVS repository conversion | # remote CVS repository conversion | ||||
if not self.cvsroot_path: | |||||
self.cvsroot_path = os.path.dirname(url.path) | |||||
self.cvsclient = CVSClient(url) | self.cvsclient = CVSClient(url) | ||||
cvsroot_path = os.path.dirname(url.path) | cvsroot_path = os.path.dirname(url.path) | ||||
self.log.info( | self.log.info( | ||||
"Fetching CVS rlog from %s:%s/%s", | "Fetching CVS rlog from %s:%s/%s", | ||||
url.host, | url.host, | ||||
cvsroot_path, | cvsroot_path, | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
) | ) | ||||
self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | ||||
self.rlog_file = self.cvsclient.fetch_rlog() | main_rlog_file = self.cvsclient.fetch_rlog() | ||||
self.rlog.parse_rlog(self.rlog_file) | self.rlog.parse_rlog(main_rlog_file) | ||||
# Find file deletion events only visible in Attic directories. | |||||
main_changesets = self.rlog.changesets | |||||
attic_paths = [] | |||||
attic_rlog_files = [] | |||||
for k in main_changesets: | |||||
vlorentz: can you use longer variable names?
for changed_file in k.revs: | |||||
path = file_path(cast(str, self.cvsroot_path), changed_file.path) | |||||
Not Done Inline Actionshard to tell, there are many implicitly typed variables here. You can use reveal_type(xxx) to make mypy print what type it inferred for xxx (no need to import it, it's only used while type-checking) vlorentz: hard to tell, there are many implicitly typed variables here.
You can use `reveal_type(xxx)`… | |||||
Not Done Inline Actionstry rebasing on top of D6598, I added a bunch of type annotations that should help mypy vlorentz: try rebasing on top of D6598, I added a bunch of type annotations that should help mypy | |||||
Done Inline ActionsThanks, I have tried this by running 'arc patch D6598' and creating a temporary branch based on the arcpatch-D6598 branch. Then I used git cherry-pick to apply my oustanding patches in sequence. The result is that mypy still complains as follows: swh/loader/cvs/loader.py:143: error: Argument 1 to "file_path" has incompatible type "Optional[str]"; expected "str" I know how to fix the one on swh/loader/cvs/loader.py:222 (declare rlog_file member as BinaryIO) but the rest elude me... it seems like a tempfile.TemporaryFile() object cannot be used with anything that requires BinaryIO, is it? stsp: Thanks, I have tried this by running 'arc patch D6598' and creating a temporary branch based on… | |||||
Not Done Inline ActionsIt can, depending on the value of the parameter (t vs b). Unfortunately, mypy does not support dependent types, so it can't know that. Use https://docs.python.org/3/library/typing.html#typing.cast vlorentz: It can, depending on the value of the parameter (`t` vs `b`).
Unfortunately, mypy does not… | |||||
Done Inline ActionsFine, I could fix all these warnings with explicit casts. stsp: Fine, I could fix all these warnings with explicit casts. | |||||
if path.startswith(cast(str, self.cvsroot_path)): | |||||
path = path[ | |||||
len( | |||||
os.path.commonpath([cast(str, self.cvsroot_path), path]) | |||||
) | |||||
+ 1 : | |||||
] | |||||
parent_path = os.path.dirname(path) | |||||
if parent_path.split("/")[-1] == "Attic": | |||||
continue | |||||
attic_path = parent_path + "/Attic" | |||||
if attic_path in attic_paths: | |||||
continue | |||||
attic_paths.append(attic_path) # avoid multiple visits | |||||
# Try to fetch more rlog data from this Attic directory. | |||||
attic_rlog_file = self.cvsclient.fetch_rlog( | |||||
path=attic_path, state="dead", | |||||
) | |||||
if attic_rlog_file: | |||||
attic_rlog_files.append(attic_rlog_file) | |||||
if len(attic_rlog_files) == 0: | |||||
self.rlog_file = main_rlog_file | |||||
else: | |||||
# Combine all the rlog pieces we found and re-parse. | |||||
fp = tempfile.TemporaryFile() | |||||
for attic_rlog_file in attic_rlog_files: | |||||
for line in attic_rlog_file.readlines(): | |||||
fp.write(line) | |||||
attic_rlog_file.close() | |||||
main_rlog_file.seek(0) | |||||
for line in main_rlog_file.readlines(): | |||||
fp.write(line) | |||||
main_rlog_file.close() | |||||
fp.seek(0) | |||||
self.rlog.parse_rlog(cast(BinaryIO, fp)) | |||||
self.rlog_file = cast(BinaryIO, fp) | |||||
cvs_changesets = sorted(self.rlog.changesets) | cvs_changesets = sorted(self.rlog.changesets) | ||||
self.log.info( | self.log.info( | ||||
"CVS changesets found for %s: %d", | "CVS changesets found for %s: %d", | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
len(cvs_changesets), | len(cvs_changesets), | ||||
) | ) | ||||
self.swh_revision_gen = self.process_cvs_changesets( | self.swh_revision_gen = self.process_cvs_changesets( | ||||
cvs_changesets, use_rcsparse=False | cvs_changesets, use_rcsparse=False | ||||
▲ Show 20 Lines • Show All 101 Lines • Show Last 20 Lines |
why move this out of the constructor?