Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py
index 7eea077..038eec8 100644
--- a/swh/loader/cvs/loader.py
+++ b/swh/loader/cvs/loader.py
@@ -1,339 +1,342 @@
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Loader in charge of injecting either new or existing cvs repositories to
swh-storage.
"""
from datetime import datetime
from mmap import ACCESS_WRITE, mmap
import os
import pty
import re
import shutil
import subprocess
import tempfile
import time
from typing import Dict, Iterator, List, Optional, Sequence, Tuple
from urllib3.util import parse_url
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders
from swh.loader.exception import NotFound
import swh.loader.cvs.rcsparse as rcsparse
from swh.loader.cvs.cvs2gitdump.cvs2gitdump import CvsConv, RcsKeywords, CHANGESET_FUZZ_SEC, file_path, ChangeSetKey
from swh.model import from_disk, hashutil
from swh.model.model import Person, Revision, RevisionType, TimestampWithTimezone
from swh.model.model import (
Content,
Directory,
Origin,
Revision,
SkippedContent,
Snapshot,
SnapshotBranch,
TargetType,
)
from swh.storage.algos.snapshot import snapshot_get_latest
from swh.storage.interface import StorageInterface
DEFAULT_BRANCH = b"HEAD"
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.cvs."
class CvsLoader(BaseLoader):
    """Swh cvs loader.

    The repository is local. The loader deals with
    update on an already previously loaded repository.
    """

    visit_type = "cvs"

    def __init__(
        self,
        storage: StorageInterface,
        url: str,
        origin_url: Optional[str] = None,
        visit_date: Optional[datetime] = None,
        cvsroot_path: Optional[str] = None,
        swh_revision: Optional[str] = None,
        start_from_scratch: bool = False,
        temp_directory: str = "/tmp",
        debug: bool = False,
        check_revision: int = 0,
        max_content_size: Optional[int] = None,
    ):
        """Initialize the CVS loader.

        Args:
            storage: swh storage instance objects are loaded into
            url: CVSROOT URL of the repository to load
            origin_url: optional origin URL; defaults to ``url`` and is used
                as the unique identifier for the origin in the swh archive
            visit_date: optional date of the visit
            cvsroot_path: optional local path of an existing CVSROOT copy; a
                temporary directory is created when not provided
            swh_revision: currently unused; kept for interface compatibility
            start_from_scratch: ignore state from previous visits when True
            temp_directory: base directory for temporary working directories
            debug: enable debug behavior
            check_revision: revision check periodicity (0 disables checks)
            max_content_size: contents larger than this are skipped
        """
        super().__init__(
            storage=storage,
            logging_class="swh.loader.cvs.CvsLoader",
            max_content_size=max_content_size,
        )
        self.cvsroot_url = url
        # origin url as unique identifier for origin in swh archive
        self.origin_url = origin_url if origin_url else self.cvsroot_url
        self.debug = debug
        self.temp_directory = temp_directory
        self.done = False
        # CVS module information; filled in by prepare()
        self.cvs_module_name: Optional[str] = None
        self.cvs_module_path: Optional[str] = None
        # changesets discovered by fetch_data()
        self.cvs_changesets: Optional[List[ChangeSetKey]] = None
        self.rcs = RcsKeywords()
        # Revision check is configurable
        self.check_revision = check_revision
        # internal state used to store swh objects
        self._contents: List[Content] = []
        self._skipped_contents: List[SkippedContent] = []
        self._directories: List[Directory] = []
        self._revisions: List[Revision] = []
        self._snapshot: Optional[Snapshot] = None
        # internal state, current visit
        self._last_revision: Optional[Revision] = None
        self._visit_status = "full"
        self._load_status = "uneventful"
        self.visit_date = visit_date
        self.cvsroot_path = cvsroot_path
        self.start_from_scratch = start_from_scratch
        self.snapshot: Optional[Snapshot] = None
        # state from previous visit
        self.latest_snapshot = None
        self.latest_revision = None

    def prepare_origin_visit(self) -> None:
        """Register the origin to visit.

        ``self.origin_url`` is always truthy after ``__init__`` (it falls
        back to the CVSROOT URL), so no further fallback is needed here.
        """
        self.origin = Origin(url=self.origin_url)

    def cleanup(self) -> None:
        self.log.info("cleanup")

    def fetch_cvs_repo_with_rsync(self, host: str, path_on_server: str) -> None:
        """Fetch a copy of the repository into ``self.cvsroot_path`` via rsync.

        Args:
            host: rsync server host name
            path_on_server: repository path on the rsync server

        Raises:
            NotFound: if the expected CVS module or the CVSROOT directory is
                not listed by the server
            subprocess.CalledProcessError: if an rsync invocation fails
        """
        # URL *must* end with a trailing slash in order to get CVSROOT listed
        url = 'rsync://%s%s/' % (host, path_on_server)
        rsync = subprocess.run(['rsync', url], capture_output=True, encoding='ascii')
        rsync.check_returncode()
        have_cvsroot = False
        have_module = False
        for line in rsync.stdout.split('\n'):
            self.log.debug("rsync server: %s" % line)
            if line.endswith(' CVSROOT'):
                have_cvsroot = True
            elif line.endswith(' %s' % self.cvs_module_name):
                have_module = True
            if have_module and have_cvsroot:
                break
        if not have_module:
            # Fix: the original format string had two placeholders but was
            # given three arguments, raising TypeError instead of NotFound.
            # The url already contains the host.
            raise NotFound("CVS module %s not found at %s"
                           % (self.cvs_module_name, url))
        if not have_cvsroot:
            raise NotFound("No CVSROOT directory found at %s" % url)
        rsync = subprocess.run(['rsync', '-a', url, self.cvsroot_path])
        rsync.check_returncode()

    def prepare(self) -> None:
        """Prepare the local working state before fetching data.

        Creates the CVSROOT and worktree temporary directories, fetches the
        repository when the origin is remote, and sanity-checks that the
        repository contains valid RCS files.

        Raises:
            NotFound: if the origin URL is invalid, the repository does not
                exist, or no valid RCS file can be found
        """
        if not self.cvsroot_path:
            self.cvsroot_path = tempfile.mkdtemp(
                suffix="-%s" % os.getpid(),
                prefix=TEMPORARY_DIR_PREFIX_PATTERN,
                dir=self.temp_directory,
            )
        self.worktree_path = tempfile.mkdtemp(
            suffix="-%s" % os.getpid(),
            prefix=TEMPORARY_DIR_PREFIX_PATTERN,
            dir=self.temp_directory,
        )
        url = parse_url(self.origin_url)
        self.log.debug("prepare; origin_url=%s scheme=%s path=%s"
                       % (self.origin_url, url.scheme, url.path))
        if not url.path:
            raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)
        self.cvs_module_name = os.path.basename(url.path)
        os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name))
        self.cvs_module_path = os.path.join(self.cvsroot_path, self.cvs_module_name)
        if url.scheme == 'file':
            if not os.path.exists(url.path):
                raise NotFound
        elif url.scheme == 'rsync':
            self.fetch_cvs_repo_with_rsync(url.host, url.path)
        else:
            raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)
        have_rcsfile = False
        have_cvsroot = False
        for root, dirs, files in os.walk(self.cvsroot_path):
            if 'CVSROOT' in dirs:
                have_cvsroot = True
                dirs.remove('CVSROOT')
                continue
            for f in files:
                filepath = os.path.join(root, f)
                if f[-2:] == ',v':
                    # rcsparse raises on invalid RCS files; a successful
                    # parse means we have data to convert. (The original
                    # wrapped this in a no-op try/except that only re-raised.)
                    rcsparse.rcsfile(filepath)
                    self.log.debug("Looks like we have data to convert; "
                                   "found a valid RCS file at %s" % filepath)
                    have_rcsfile = True
                    break
            if have_rcsfile:
                break
        if not have_rcsfile:
            # Fix: the original format string had two placeholders but a
            # single argument, raising TypeError instead of NotFound.
            raise NotFound("Directory %s does not contain any valid RCS files"
                           % self.cvsroot_path)
        if not have_cvsroot:
            # Logger.warn() is a deprecated alias; warning() is the
            # documented spelling.
            self.log.warning(
                "The CVS repository at '%s' lacks a CVSROOT directory; "
                "we might be ingesting an incomplete copy of the repository"
                % self.cvsroot_path)

    def fetch_data(self) -> None:
        """Fetch CVS revision information.

        Unfortunately, there is no way to convert CVS history in an iterative
        fashion because the data is not indexed by any kind of changeset ID.
        We need to walk the history of each and every RCS file in the
        repository during every visit, even if no new changes will be added
        to the SWH archive afterwards.

        "CVS's repository is the software equivalent of a telephone book
        sorted by telephone number."
        https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/

        An implicit assumption made here is that self.cvs_changesets will fit
        into memory in its entirety. If it won't fit then the CVS walker will
        need to be modified such that it spools the list of changesets to
        disk instead.
        """
        cvs = CvsConv(self.cvsroot_path, self.rcs, False, CHANGESET_FUZZ_SEC)
        self.log.debug("Walking CVS module %s", self.cvs_module_name)
        cvs.walk(self.cvs_module_name)
        self.cvs_changesets = sorted(cvs.changesets)
        self.log.info('CVS changesets found in %s: %d'
                      % (self.cvs_module_name, len(self.cvs_changesets)))
        if len(self.cvs_changesets) > 0:
            self._load_status = "eventful"

    def build_swh_revision(
        self, k: ChangeSetKey, dir_id: bytes, parents: Sequence[bytes]
    ) -> Revision:
        """Given a CVS revision, build a swh revision.

        Args:
            k: changeset key carrying revision ids, date, author and paths
            dir_id: the tree's hash identifier
            parents: the revision's parents identifier

        Returns:
            The swh Revision object.
        """
        author = Person.from_fullname(k.author.encode('UTF-8'))
        date = TimestampWithTimezone.from_datetime(k.max_time)
        # XXX parsing the rcsfile twice, once in expand_keyword(), and again here
        rcs = rcsparse.rcsfile(k.revs[0].path)
        msg = rcs.getlog(k.revs[0].rev)
        return Revision(
            type=RevisionType.CVS,
            date=date,
            committer_date=date,
            directory=dir_id,
            message=msg,
            author=author,
            committer=author,
            synthetic=True,
            extra_headers=[],
            parents=tuple(parents),
        )

    def generate_and_load_snapshot(
        self, revision: Optional[Revision] = None, snapshot: Optional[Snapshot] = None
    ) -> Snapshot:
        """Create the snapshot either from existing revision or snapshot.

        Revision (supposedly new) has priority over the snapshot
        (supposedly existing one).

        Args:
            revision: Last revision seen if any (None by default)
            snapshot: Snapshot to use if any (None by default)

        Returns:
            The newly created (or reused) snapshot, already added to storage.

        Raises:
            ValueError: if both ``revision`` and ``snapshot`` are None
        """
        if revision:  # Priority to the revision
            snap = Snapshot(
                branches={
                    DEFAULT_BRANCH: SnapshotBranch(
                        target=revision.id, target_type=TargetType.REVISION
                    )
                }
            )
        elif snapshot:  # Fallback to prior snapshot
            snap = snapshot
        else:
            raise ValueError(
                "generate_and_load_snapshot called with null revision and snapshot!"
            )
        self.log.debug("snapshot: %s" % snap)
        self.storage.snapshot_add([snap])
        return snap

    def store_data(self) -> None:
        """Add CVS revisions to the archive.

        Compute SWH changeset IDs from CVS revision information and add new
        revisions to the archive.
        """
        # XXX At present changeset IDs are recomputed on the fly during every
        # visit. If we were able to maintain a cache somewhere which can be
        # indexed by a cvs2gitdump.ChangeSetKey and yields an SWH revision
        # hash we could avoid doing a lot of redundant work during every
        # visit.
        for k in self.cvs_changesets:
            tstr = time.strftime('%c', time.gmtime(k.max_time))
            self.log.debug("changeset from %s by %s on branch %s",
                           tstr, k.author, k.branch)
            # Check out the on-disk state of this revision
            for f in k.revs:
                path = file_path(self.cvsroot_path, f.path)
                wtpath = os.path.join(self.worktree_path, path)
                self.log.debug("rev %s of file %s" % (f.rev, f.path))
                if f.state == 'dead':
                    # remove this file from work tree
                    try:
                        os.remove(wtpath)
                    except FileNotFoundError:
                        pass
                else:
                    # create, or update, this file in the work tree; ensure
                    # the parent directory exists up front instead of the
                    # original open/makedirs/open retry, and let the
                    # with-block close the handle even if the write raises
                    contents = self.rcs.expand_keyword(f.path, f.rev)
                    os.makedirs(os.path.dirname(wtpath), exist_ok=True)
                    with open(wtpath, mode='wb') as outfile:
                        outfile.write(contents)
            # Compute SWH revision from the on-disk state
            swh_dir = from_disk.Directory.from_disk(
                path=os.fsencode(self.worktree_path))
            (content, skipped_content, directories) = from_disk.iter_directory(
                swh_dir)
            revision = self.build_swh_revision(k, swh_dir.hash, [])
            self.log.debug("SWH revision ID: %s" % hashutil.hash_to_hex(revision.id))
            self._contents.extend(content)
            self._skipped_contents.extend(skipped_content)
            self._directories.extend(directories)
            self._revisions.append(revision)
            self._last_revision = revision
        self.storage.skipped_content_add(self._skipped_contents)
        self.storage.content_add(self._contents)
        self.storage.directory_add(self._directories)
        self.storage.revision_add(self._revisions)
        self.snapshot = self.generate_and_load_snapshot(
            revision=self._last_revision, snapshot=self._snapshot
        )
        self.log.debug("SWH snapshot ID: %s" % hashutil.hash_to_hex(self.snapshot.id))
        self.flush()
        self.loaded_snapshot_id = self.snapshot.id

    def load_status(self):
        return {
            "status": self._load_status,
        }

    def visit_status(self):
        return self._visit_status
diff --git a/swh/loader/cvs/tests/test_loader.py b/swh/loader/cvs/tests/test_loader.py
index f835f9a..d59de15 100644
--- a/swh/loader/cvs/tests/test_loader.py
+++ b/swh/loader/cvs/tests/test_loader.py
@@ -1,61 +1,71 @@
# Copyright (C) 2016-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import pytest
from swh.loader.cvs.loader import CvsLoader
from swh.loader.tests import (
assert_last_visit_matches,
check_snapshot,
get_stats,
prepare_repository_from_archive,
)
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Snapshot, SnapshotBranch, TargetType
# Expected snapshot for the "runbaby" test repository: a single HEAD branch
# pointing at the one revision produced by loading the archive.
RUNBABY_SNAPSHOT = Snapshot(
    id=hash_to_bytes("1cff69ab9bd70822d5e3006092f943ccaafdcf57"),
    branches={
        b"HEAD": SnapshotBranch(
            target=hash_to_bytes("ef511d258fa55035c2bc2a5b05cad233cee1d328"),
            target_type=TargetType.REVISION,
        )
    },
)

def test_loader_cvs_not_found_no_mock(swh_storage, tmp_path):
    """Given an unknown repository, the loader visit ends up in status not_found"""
    unknown_repo_url = "unknown-repository"
    cvs_loader = CvsLoader(swh_storage, unknown_repo_url, cvsroot_path=tmp_path)
    # Loading a non-existent repository is uneventful ...
    result = cvs_loader.load()
    assert result == {"status": "uneventful"}
    # ... and the recorded visit carries the not_found status.
    assert_last_visit_matches(
        swh_storage, unknown_repo_url, status="not_found", type="cvs",
    )
def test_loader_cvs_visit(swh_storage, datadir, tmp_path):
    """Eventful visit should yield 1 snapshot"""
    archive_name = "runbaby"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    loader = CvsLoader(
        swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name)
    )
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="cvs",
        snapshot=RUNBABY_SNAPSHOT.id,
    )
    stats = get_stats(loader.storage)
    assert stats == {
        "content": 5,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 1,
        "skipped_content": 0,
        "snapshot": 1,
    }
    # Fix: the trailing check was commented out and referenced the undefined
    # GOURMET_SNAPSHOT; the visit above is already asserted to end on
    # RUNBABY_SNAPSHOT.id, so verify the full snapshot content as well.
    check_snapshot(RUNBABY_SNAPSHOT, loader.storage)

File Metadata

Mime Type
text/x-diff
Expires
Jul 4 2025, 6:30 PM (5 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3270454

Event Timeline