Differential D6295 Diff 22943 swh/loader/cvs/loader.py

Changeset View

Standalone View

swh/loader/cvs/loader.py

Show First 20 Lines • Show All 114 Lines • ▼ Show 20 Lines	def compute_swh_revision(self, k, logmsg):
"""		"""
# Compute SWH revision from the on-disk state		# Compute SWH revision from the on-disk state
swh_dir = from_disk.Directory.from_disk(path=os.fsencode(self.worktree_path))		swh_dir = from_disk.Directory.from_disk(path=os.fsencode(self.worktree_path))
if self._last_revision:		if self._last_revision:
parents = (self._last_revision.id,)		parents = (self._last_revision.id,)
else:		else:
parents = ()		parents = ()
revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents)		revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents)
self.log.debug("SWH revision ID: %s" % hashutil.hash_to_hex(revision.id))		self.log.debug("SWH revision ID: %s", hashutil.hash_to_hex(revision.id))
self._last_revision = revision		self._last_revision = revision
return (revision, swh_dir)		return (revision, swh_dir)

def process_cvs_changesets(		def process_cvs_changesets(
self, cvs_changesets,		self, cvs_changesets,
) -> Iterator[		) -> Iterator[
Tuple[List[Content], List[SkippedContent], List[Directory], Revision]		Tuple[List[Content], List[SkippedContent], List[Directory], Revision]
]:		]:
Show All 12 Lines	]:
"changeset from %s by %s on branch %s", tstr, k.author, k.branch		"changeset from %s by %s on branch %s", tstr, k.author, k.branch
)		)
logmsg = ""		logmsg = ""
# Check out the on-disk state of this revision		# Check out the on-disk state of this revision
for f in k.revs:		for f in k.revs:
rcsfile = None		rcsfile = None
path = file_path(self.cvsroot_path, f.path)		path = file_path(self.cvsroot_path, f.path)
wtpath = os.path.join(self.worktree_path, path)		wtpath = os.path.join(self.worktree_path, path)
self.log.info("rev %s of file %s" % (f.rev, f.path))		self.log.info("rev %s of file %s", f.rev, f.path)
if not logmsg:		if not logmsg:
rcsfile = rcsparse.rcsfile(f.path)		rcsfile = rcsparse.rcsfile(f.path)
logmsg = rcsfile.getlog(k.revs[0].rev)		logmsg = rcsfile.getlog(k.revs[0].rev)
if f.state == "dead":		if f.state == "dead":
# remove this file from work tree		# remove this file from work tree
try:		try:
os.remove(wtpath)		os.remove(wtpath)
except FileNotFoundError:		except FileNotFoundError:
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	]:
self.log.info(		self.log.info(
"changeset from %s by %s on branch %s", tstr, k.author, k.branch		"changeset from %s by %s on branch %s", tstr, k.author, k.branch
)		)
logmsg = ""		logmsg = ""
# Check out the on-disk state of this revision		# Check out the on-disk state of this revision
for f in k.revs:		for f in k.revs:
path = file_path(self.cvsroot_path, f.path)		path = file_path(self.cvsroot_path, f.path)
wtpath = os.path.join(self.worktree_path, path)		wtpath = os.path.join(self.worktree_path, path)
self.log.info("rev %s of file %s" % (f.rev, f.path))		self.log.info("rev %s of file %s", f.rev, f.path)
if not logmsg:		if not logmsg:
logmsg = self.rlog.getlog(self.rlog_file, f.path, k.revs[0].rev)		logmsg = self.rlog.getlog(self.rlog_file, f.path, k.revs[0].rev)
self.log.debug("f.state is %s\n" % f.state)		self.log.debug("f.state is %s", f.state)
if f.state == "dead":		if f.state == "dead":
# remove this file from work tree		# remove this file from work tree
try:		try:
os.remove(wtpath)		os.remove(wtpath)
except FileNotFoundError:		except FileNotFoundError:
pass		pass
else:		else:
dirname = os.path.dirname(wtpath)		dirname = os.path.dirname(wtpath)
os.makedirs(dirname, exist_ok=True)		os.makedirs(dirname, exist_ok=True)
self.log.debug("checkout to %s\n" % wtpath)		self.log.debug("checkout to %s", wtpath)
assert self.cvsclient # avoid None type error from mypy		assert self.cvsclient # avoid None type error from mypy
fp = self.cvsclient.checkout(f.path, f.rev, dirname)		fp = self.cvsclient.checkout(f.path, f.rev, dirname)
os.rename(fp.name, wtpath)		os.rename(fp.name, wtpath)
try:		try:
fp.close()		fp.close()
except FileNotFoundError:		except FileNotFoundError:
# Well, we have just renamed the file...		# Well, we have just renamed the file...
pass		pass
Show All 27 Lines	class CvsLoader(BaseLoader):
def fetch_cvs_repo_with_rsync(self, host, path):		def fetch_cvs_repo_with_rsync(self, host, path):
# URL must end with a trailing slash in order to get CVSROOT listed		# URL must end with a trailing slash in order to get CVSROOT listed
url = "rsync://%s%s/" % (host, os.path.dirname(path))		url = "rsync://%s%s/" % (host, os.path.dirname(path))
rsync = subprocess.run(["rsync", url], capture_output=True, encoding="ascii")		rsync = subprocess.run(["rsync", url], capture_output=True, encoding="ascii")
rsync.check_returncode()		rsync.check_returncode()
have_cvsroot = False		have_cvsroot = False
have_module = False		have_module = False
for line in rsync.stdout.split("\n"):		for line in rsync.stdout.split("\n"):
self.log.debug("rsync server: %s" % line)		self.log.debug("rsync server: %s", line)
if line.endswith(" CVSROOT"):		if line.endswith(" CVSROOT"):
have_cvsroot = True		have_cvsroot = True
elif line.endswith(" %s" % self.cvs_module_name):		elif line.endswith(" %s" % self.cvs_module_name):
have_module = True		have_module = True
if have_module and have_cvsroot:		if have_module and have_cvsroot:
break		break
if not have_module:		if not have_module:
raise NotFound(		raise NotFound(
Show All 16 Lines	def prepare(self):
)		)
self.worktree_path = tempfile.mkdtemp(		self.worktree_path = tempfile.mkdtemp(
suffix="-%s" % os.getpid(),		suffix="-%s" % os.getpid(),
prefix=TEMPORARY_DIR_PREFIX_PATTERN,		prefix=TEMPORARY_DIR_PREFIX_PATTERN,
dir=self.temp_directory,		dir=self.temp_directory,
)		)
url = parse_url(self.origin_url)		url = parse_url(self.origin_url)
self.log.debug(		self.log.debug(
"prepare; origin_url=%s scheme=%s path=%s"		"prepare; origin_url=%s scheme=%s path=%s",
% (self.origin_url, url.scheme, url.path)		self.origin_url,
		url.scheme,
		url.path,
)		)
if not url.path:		if not url.path:
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)		raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)
self.cvs_module_name = os.path.basename(url.path)		self.cvs_module_name = os.path.basename(url.path)
os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name))		os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name))
if url.scheme == "file":		if url.scheme == "file":
if not os.path.exists(url.path):		if not os.path.exists(url.path):
raise NotFound		raise NotFound
Show All 14 Lines	def prepare(self):
if f[-2:] == ",v":		if f[-2:] == ",v":
try:		try:
rcsfile = rcsparse.rcsfile(filepath) # noqa: F841		rcsfile = rcsparse.rcsfile(filepath) # noqa: F841
except (Exception):		except (Exception):
raise		raise
else:		else:
self.log.debug(		self.log.debug(
"Looks like we have data to convert; "		"Looks like we have data to convert; "
"found a valid RCS file at %s" % filepath		"found a valid RCS file at %s",
		filepath,
)		)
have_rcsfile = True		have_rcsfile = True
break		break
if have_rcsfile:		if have_rcsfile:
break		break

if not have_rcsfile:		if not have_rcsfile:
raise NotFound(		raise NotFound(
"Directory %s does not contain any valid RCS files %s"		"Directory %s does not contain any valid RCS files %s",
% self.cvsroot_path		self.cvsroot_path,
)		)
if not have_cvsroot:		if not have_cvsroot:
self.log.warn(		self.log.warn(
"The CVS repository at '%s' lacks a CVSROOT directory; "		"The CVS repository at '%s' lacks a CVSROOT directory; "
"we might be ingesting an incomplete copy of the repository"		"we might be ingesting an incomplete copy of the repository",
% self.cvsroot_path		self.cvsroot_path,
)		)

# Unfortunately, there is no way to convert CVS history in an		# Unfortunately, there is no way to convert CVS history in an
# iterative fashion because the data is not indexed by any kind		# iterative fashion because the data is not indexed by any kind
# of changeset ID. We need to walk the history of each and every		# of changeset ID. We need to walk the history of each and every
# RCS file in the repository during every visit, even if no new		# RCS file in the repository during every visit, even if no new
# changes will be added to the SWH archive afterwards.		# changes will be added to the SWH archive afterwards.
# "CVS’s repository is the software equivalent of a telephone book		# "CVS’s repository is the software equivalent of a telephone book
# sorted by telephone number."		# sorted by telephone number."
# https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/		# https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/
#		#
# An implicit assumption made here is that self.cvs_changesets will		# An implicit assumption made here is that self.cvs_changesets will
# fit into memory in its entirety. If it won't fit then the CVS walker		# fit into memory in its entirety. If it won't fit then the CVS walker
# will need to be modified such that it spools the list of changesets		# will need to be modified such that it spools the list of changesets
# to disk instead.		# to disk instead.
cvs = CvsConv(self.cvsroot_path, RcsKeywords(), False, CHANGESET_FUZZ_SEC)		cvs = CvsConv(self.cvsroot_path, RcsKeywords(), False, CHANGESET_FUZZ_SEC)
self.log.info("Walking CVS module %s", self.cvs_module_name)		self.log.info("Walking CVS module %s", self.cvs_module_name)
cvs.walk(self.cvs_module_name)		cvs.walk(self.cvs_module_name)
cvs_changesets = sorted(cvs.changesets)		cvs_changesets = sorted(cvs.changesets)
self.log.info(		self.log.info(
"CVS changesets found in %s: %d"		"CVS changesets found in %s: %d",
% (self.cvs_module_name, len(cvs_changesets))		self.cvs_module_name,
		len(cvs_changesets),
)		)
self.swh_revision_gen = self.process_cvs_changesets(cvs_changesets)		self.swh_revision_gen = self.process_cvs_changesets(cvs_changesets)
elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh":		elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh":
# remote CVS repository conversion		# remote CVS repository conversion
self.cvsclient = cvsclient.CVSClient(url)		self.cvsclient = cvsclient.CVSClient(url)
cvsroot_path = os.path.dirname(url.path)		cvsroot_path = os.path.dirname(url.path)
self.log.info(		self.log.info(
"Fetching CVS rlog from %s:%s/%s",		"Fetching CVS rlog from %s:%s/%s",
url.host,		url.host,
cvsroot_path,		cvsroot_path,
self.cvs_module_name,		self.cvs_module_name,
)		)
self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC)		self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC)
self.rlog_file = self.cvsclient.fetch_rlog()		self.rlog_file = self.cvsclient.fetch_rlog()
self.rlog.parse_rlog(self.rlog_file)		self.rlog.parse_rlog(self.rlog_file)
cvs_changesets = sorted(self.rlog.changesets)		cvs_changesets = sorted(self.rlog.changesets)
self.log.info(		self.log.info(
"CVS changesets found for %s: %d"		"CVS changesets found for %s: %d",
% (self.cvs_module_name, len(cvs_changesets))		self.cvs_module_name,
		len(cvs_changesets),
)		)
self.swh_revision_gen = self.process_cvs_rlog_changesets(cvs_changesets)		self.swh_revision_gen = self.process_cvs_rlog_changesets(cvs_changesets)
else:		else:
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)		raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)

def fetch_data(self):
    """Fetch the next CVS revision.

    Pulls the next item from ``self.swh_revision_gen`` and unpacks it into
    ``self._contents``, ``self._skipped_contents``, ``self._directories``
    and ``self._revisions`` (a one-element list holding the revision).

    Returns:
        True when a changeset was fetched and stored on the instance;
        False when the generator is exhausted or when it raised (the
        exception is logged with its traceback and iteration stops).
    """
    try:
        data = next(self.swh_revision_gen)
    except StopIteration:
        # Normal end of the changeset stream.
        return False
    except Exception:
        # Best-effort: record the traceback and stop iterating rather than
        # propagating the error to the caller.
        self.log.exception("Exception in fetch_data:")
        return False  # Stopping iteration
    self._contents, self._skipped_contents, self._directories, rev = data
    self._revisions = [rev]
    return True

def build_swh_revision(		def build_swh_revision(
self, k: ChangeSetKey, logmsg: bytes, dir_id: bytes, parents: Sequence[bytes]		self, k: ChangeSetKey, logmsg: bytes, dir_id: bytes, parents: Sequence[bytes]
) -> Revision:		) -> Revision:
Show All 37 Lines	def generate_and_load_snapshot(self, revision) -> Snapshot:
"""		"""
snap = Snapshot(		snap = Snapshot(
branches={		branches={
DEFAULT_BRANCH: SnapshotBranch(		DEFAULT_BRANCH: SnapshotBranch(
target=revision.id, target_type=TargetType.REVISION		target=revision.id, target_type=TargetType.REVISION
)		)
}		}
)		)
self.log.debug("snapshot: %s" % snap)		self.log.debug("snapshot: %s", snap)
self.storage.snapshot_add([snap])		self.storage.snapshot_add([snap])
return snap		return snap

def store_data(self):
    """Add our current CVS changeset to the archive.

    Sends the buffered skipped contents, contents, directories and
    revisions to ``self.storage`` (in that order), then builds and stores a
    snapshot from ``self._last_revision``, flushes, records the snapshot
    id in ``self.loaded_snapshot_id`` and empties the per-changeset buffers.
    """
    self.storage.skipped_content_add(self._skipped_contents)
    self.storage.content_add(self._contents)
    self.storage.directory_add(self._directories)
    self.storage.revision_add(self._revisions)
    # A snapshot is generated after every changeset, always targeting the
    # most recently computed revision.
    self.snapshot = self.generate_and_load_snapshot(self._last_revision)
    self.log.debug("SWH snapshot ID: %s", hashutil.hash_to_hex(self.snapshot.id))
    self.flush()
    self.loaded_snapshot_id = self.snapshot.id
    # Reset the buffers so the next fetch_data() starts from a clean state.
    self._skipped_contents = []
    self._contents = []
    self._directories = []
    self._revisions = []

def load_status(self):		def load_status(self):
Show All 11 Lines