diff --git a/swh/loader/cvs/cvsclient.py b/swh/loader/cvs/cvsclient.py index 6cec698..0ff0f24 100644 --- a/swh/loader/cvs/cvsclient.py +++ b/swh/loader/cvs/cvsclient.py @@ -1,417 +1,433 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information """Minimal CVS client implementation """ import os.path import socket import subprocess import tempfile from swh.loader.exception import NotFound CVS_PSERVER_PORT = 2401 CVS_PROTOCOL_BUFFER_SIZE = 8192 EXAMPLE_PSERVER_URL = "pserver://user:password@cvs.example.com/cvsroot/repository" EXAMPLE_SSH_URL = "ssh://user@cvs.example.com/cvsroot/repository" VALID_RESPONSES = [ "ok", "error", "Valid-requests", "Checked-in", "New-entry", "Checksum", "Copy-file", "Updated", "Created", "Update-existing", "Merged", "Patched", "Rcs-diff", "Mode", "Removed", "Remove-entry", "Template", "Notified", "Module-expansion", "Wrapper-rcsOption", "M", "Mbinary", "E", "F", "MT", ] # Trivially encode strings to protect them from innocent eyes (i.e., # inadvertent password compromises, like a network administrator # who's watching packets for legitimate reasons and accidentally sees # the password protocol go by). # # This is NOT secure encryption. def scramble_password(password): s = ["A"] # scramble scheme version number # fmt: off scramble_shifts = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, # noqa: E241 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, # noqa: E241,E131,E501 114,120, 53, 79, 96,109, 72,108, 70, 64, 76, 67,116, 74, 68, 87, # noqa: E241,E131,E501 111, 52, 75,119, 49, 34, 82, 81, 95, 65,112, 86,118,110,122,105, # noqa: E241,E131,E501 41, 57, 83, 43, 46,102, 40, 89, 38,103, 45, 50, 42,123, 91, 35, # noqa: E241,E131,E501 125, 55, 54, 66,124,126, 59, 47, 92, 71,115, 78, 88,107,106, 56, # noqa: E241,E131,E501 36,121,117,104,101,100, 69, 73, 99, 63, 94, 93, 39, 37, 61, 48, # noqa: E241,E131,E501 58,113, 32, 90, 44, 98, 60, 51, 33, 97, 62, 77, 84, 80, 85,223, # noqa: E241,E131,E501 225,216,187,166,229,189,222,188,141,249,148,200,184,136,248,190, # noqa: E241,E131,E501 199,170,181,204,138,232,218,183,255,234,220,247,213,203,226,193, # noqa: E241,E131,E501 174,172,228,252,217,201,131,230,197,211,145,238,161,179,160,212, # noqa: E241,E131,E501 207,221,254,173,202,146,224,151,140,196,205,130,135,133,143,246, # noqa: E241,E131,E501 192,159,244,239,185,168,215,144,139,165,180,157,147,186,214,176, # noqa: E241,E131,E501 227,231,219,169,175,156,206,198,129,164,150,210,154,177,134,127, # noqa: E241,E131,E501 182,128,158,208,162,132,167,209,149,241,153,251,237,236,171,195, # noqa: E241,E131,E501 243,233,253,240,194,250,191,155,142,137,245,235,163,242,178,152] # noqa: E241,E131,E501 # fmt: on for c in password: s.append("%c" % scramble_shifts[ord(c)]) return "".join(s) class CVSProtocolError(Exception): pass class CVSClient: def connect_pserver(self, hostname, port, auth): if port is None: port = CVS_PSERVER_PORT if auth is None: raise NotFound( "Username and password are required for " "a pserver connection: %s" % EXAMPLE_PSERVER_URL ) try: user = auth.split(":")[0] password = auth.split(":")[1] except IndexError: raise NotFound( "Username and password are required for " "a pserver connection: %s" % EXAMPLE_PSERVER_URL ) try: self.socket = socket.create_connection((hostname, port)) except ConnectionRefusedError: raise NotFound("Could not connect to 
%s:%s", hostname, port) scrambled_password = scramble_password(password) request = "BEGIN AUTH REQUEST\n%s\n%s\n%s\nEND AUTH REQUEST\n" % ( self.cvsroot_path, user, scrambled_password, ) print("Request: %s\n" % request) self.socket.sendall(request.encode("UTF-8")) response = self.conn_read_line() if response != b"I LOVE YOU\n": raise NotFound( "pserver authentication failed for %s:%s: %s" % (hostname, port, response) ) def connect_ssh(self, hostname, port, auth): command = ["ssh"] if auth is not None: # Assume 'auth' contains only a user name. # We do not support password authentication with SSH since the # anoncvs user is usually granted access without a password. command += ["-l", "%s" % auth] if port is not None: command += ["-p", "%d" % port] # accept new SSH hosts keys upon first use; changed host keys # will require intervention command += ["-o", "StrictHostKeyChecking=accept-new"] # disable interactive prompting command += ["-o", "BatchMode=yes"] # disable further option processing by adding '--' command += ["--"] command += ["%s" % hostname, "cvs", "server"] # use non-buffered I/O to match behaviour of self.socket self.ssh = subprocess.Popen( command, bufsize=0, stdin=subprocess.PIPE, stdout=subprocess.PIPE ) def connect_fake(self, hostname, port, auth): command = ["cvs", "server"] # use non-buffered I/O to match behaviour of self.socket self.ssh = subprocess.Popen( command, bufsize=0, stdin=subprocess.PIPE, stdout=subprocess.PIPE ) def conn_read_line(self, require_newline=True): if len(self.linebuffer) != 0: return self.linebuffer.pop(0) buf = b"" idx = -1 while idx == -1: if len(buf) >= CVS_PROTOCOL_BUFFER_SIZE: if require_newline: raise CVSProtocolError( "Overlong response from " "CVS server: %s" % buf ) else: break if self.socket: buf += self.socket.recv(CVS_PROTOCOL_BUFFER_SIZE) elif self.ssh: buf += self.ssh.stdout.read(CVS_PROTOCOL_BUFFER_SIZE) else: raise Exception("No valid connection") if not buf: return None idx = buf.rfind(b"\n") if idx != -1: self.linebuffer = buf[: idx + 1].splitlines(keepends=True) else: if require_newline: raise CVSProtocolError("Invalid response from CVS server: %s" % buf) else: self.linebuffer.append(buf) if len(self.incomplete_line) > 0: self.linebuffer[0] = self.incomplete_line + self.linebuffer[0] if idx != -1: self.incomplete_line = buf[idx + 1 :] else: self.incomplete_line = b"" return self.linebuffer.pop(0) def conn_write(self, data): if self.socket: return self.socket.sendall(data) if self.ssh: self.ssh.stdin.write(data) return self.ssh.stdin.flush() raise Exception("No valid connection") def conn_write_str(self, s): return self.conn_write(s.encode("UTF-8")) def conn_close(self): if self.socket: self.socket.close() if self.ssh: self.ssh.kill() try: self.ssh.wait(timeout=10) except subprocess.TimeoutExpired as e: raise subprocess.TimeoutExpired( "Could not terminate " "ssh program: %s" % e ) def __init__(self, url): """ Connect to a CVS server at the specified URL and perform the initial CVS protocol handshake. 
""" self.hostname = url.host self.cvsroot_path = os.path.dirname(url.path) self.cvs_module_name = os.path.basename(url.path) self.socket = None self.ssh = None self.linebuffer = list() self.incomplete_line = b"" if url.scheme == "pserver": self.connect_pserver(url.host, url.port, url.auth) elif url.scheme == "ssh": self.connect_ssh(url.host, url.port, url.auth) elif url.scheme == "fake": self.connect_fake(url.host, url.port, url.auth) else: raise NotFound("Invalid CVS origin URL '%s'" % url) # we should have a connection now assert self.socket or self.ssh self.conn_write_str( "Root %s\nValid-responses %s\nvalid-requests\n" "UseUnchanged\n" % (self.cvsroot_path, " ".join(VALID_RESPONSES)) ) response = self.conn_read_line() if not response: raise CVSProtocolError("No response from CVS server") try: if response[0:15] != b"Valid-requests ": raise CVSProtocolError( "Invalid response from " "CVS server: %s" % response ) except IndexError: raise CVSProtocolError("Invalid response from CVS server: %s" % response) response = self.conn_read_line() if response != b"ok\n": raise CVSProtocolError("Invalid response from CVS server: %s" % response) def __del__(self): self.conn_close() def _parse_rlog_response(self, fp): rlog_output = tempfile.TemporaryFile() expect_error = False for line in fp.readlines(): if expect_error: raise CVSProtocolError("CVS server error: %s" % line) if line == b"ok\n": break elif line == b"M \n": continue elif line[0:2] == b"M ": rlog_output.write(line[2:]) elif line[0:8] == b"MT text ": rlog_output.write(line[8:-1]) elif line[0:8] == b"MT date ": rlog_output.write(line[8:-1]) elif line[0:10] == b"MT newline": rlog_output.write(line[10:]) elif line[0:7] == b"error ": expect_error = True continue else: raise CVSProtocolError("Bad CVS protocol response: %s" % line) rlog_output.seek(0) return rlog_output - def fetch_rlog(self): + def fetch_rlog(self, path="", state=""): + path_arg = path or self.cvs_module_name + if len(state) > 0: + state_arg = "Argument -s%s\n" % state + else: + state_arg = "" fp = tempfile.TemporaryFile() self.conn_write_str( - "Global_option -q\nArgument --\nArgument %s\nrlog\n" % self.cvs_module_name + "Global_option -q\n" + f"{state_arg}" + "Argument --\n" + f"Argument {path_arg}\n" + "rlog\n" ) while True: response = self.conn_read_line() if response is None: raise CVSProtocolError("No response from CVS server") if response[0:2] == b"E ": + if len(path) > 0 and response[-11:] == b" - ignored\n": + response = self.conn_read_line() + if response != b"error \n": + raise CVSProtocolError( + "Invalid response from CVS server: %s" % response + ) + return None # requested path does not exist (ignore) raise CVSProtocolError("Error response from CVS server: %s" % response) fp.write(response) if response == b"ok\n": break fp.seek(0) return self._parse_rlog_response(fp) def checkout(self, path, rev, dest_dir, expand_keywords): """ Download a file revision from the cvs server and store the file's contents in a temporary file. If expand_keywords is set then ask the server to expand RCS keywords in file content. From the server's point of view this function behaves much like 'cvs update -r rev path'. The server is unaware that we do not actually maintain a CVS working copy. Because of this it sends more information than we need. We simply skip responses that are of no interest to us. 
""" skip_line = False expect_modeline = False expect_bytecount = False have_bytecount = False bytecount = 0 dirname = os.path.dirname(path) if dirname: self.conn_write_str("Directory %s\n%s\n" % (dirname, dirname)) filename = os.path.basename(path) co_output = tempfile.NamedTemporaryFile( dir=dest_dir, delete=True, prefix="cvsclient-checkout-%s-r%s-" % (filename, rev), ) if expand_keywords: # use server-side per-file default expansion rules karg = "" else: # force binary file mode karg = "Argument -kb\n" # TODO: cvs <= 1.10 servers expect to be given every Directory along the path. self.conn_write_str( "Directory %s\n%s\n" "Global_option -q\n" "Argument -r%s\n" "%s" "Argument --\nArgument %s\nco \n" % (self.cvs_module_name, self.cvs_module_name, rev, karg, path) ) while True: if have_bytecount: if bytecount < 0: raise CVSProtocolError("server sent too much file content data") response = self.conn_read_line(require_newline=False) if response is None: raise CVSProtocolError("Incomplete response from CVS server") if len(response) > bytecount: # When a file lacks a final newline we receive a line which # contains file content as well as CVS protocol response data. # Split last line of file content from CVS protocol data... co_output.write(response[:bytecount]) response = response[bytecount:] bytecount = 0 # ...and process the CVS protocol response below. else: co_output.write(response) bytecount -= len(response) continue else: response = self.conn_read_line() if response[0:2] == b"E ": raise CVSProtocolError("Error from CVS server: %s" % response) if response == b"ok\n": if have_bytecount: break else: raise CVSProtocolError("server sent 'ok' but no file contents") if skip_line: skip_line = False continue elif expect_bytecount: try: bytecount = int(response[0:-1]) # strip trailing \n except ValueError: raise CVSProtocolError("Bad CVS protocol response: %s" % response) have_bytecount = True continue elif response in (b"M \n", b"MT +updated\n", b"MT -updated\n"): continue elif response[0:9] == b"MT fname ": continue elif response.split(b" ")[0] in ( b"Created", b"Checked-in", b"Update-existing", b"Updated", b"Removed", ): skip_line = True continue elif response[0:1] == b"/": expect_modeline = True continue elif expect_modeline and response[0:2] == b"u=": expect_modeline = False expect_bytecount = True continue elif response[0:2] == b"M ": continue elif response[0:8] == b"MT text ": continue elif response[0:10] == b"MT newline": continue else: raise CVSProtocolError("Bad CVS protocol response: %s" % response) co_output.seek(0) return co_output diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py index f7dcb80..fa5279b 100644 --- a/swh/loader/cvs/loader.py +++ b/swh/loader/cvs/loader.py @@ -1,482 +1,534 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information """Loader in charge of injecting either new or existing cvs repositories to swh-storage. 
""" from datetime import datetime import os +import os.path import subprocess import tempfile import time -from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple +from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple, cast from urllib3.util import parse_url from swh.loader.core.loader import BaseLoader from swh.loader.core.utils import clean_dangling_folders from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ( CHANGESET_FUZZ_SEC, ChangeSetKey, CvsConv, FileRevision, RcsKeywords, file_path, ) from swh.loader.cvs.cvsclient import CVSClient import swh.loader.cvs.rcsparse as rcsparse from swh.loader.cvs.rlog import RlogConv from swh.loader.exception import NotFound from swh.model import from_disk, hashutil from swh.model.model import ( Content, Directory, Origin, Person, Revision, RevisionType, Sha1Git, SkippedContent, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import StorageInterface DEFAULT_BRANCH = b"HEAD" TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.cvs." class CvsLoader(BaseLoader): """Swh cvs loader. The repository is local. The loader deals with update on an already previously loaded repository. """ visit_type = "cvs" cvs_module_name: str cvsclient: CVSClient # remote CVS repository access (history is parsed from CVS rlog): rlog_file: BinaryIO swh_revision_gen: Iterator[ Tuple[List[Content], List[SkippedContent], List[Directory], Revision] ] def __init__( self, storage: StorageInterface, url: str, origin_url: Optional[str] = None, visit_date: Optional[datetime] = None, cvsroot_path: Optional[str] = None, temp_directory: str = "/tmp", max_content_size: Optional[int] = None, ): super().__init__( storage=storage, logging_class="swh.loader.cvs.CvsLoader", max_content_size=max_content_size, ) self.cvsroot_url = url # origin url as unique identifier for origin in swh archive self.origin_url = origin_url if origin_url else self.cvsroot_url self.temp_directory = temp_directory # internal state used to store swh objects self._contents: List[Content] = [] self._skipped_contents: List[SkippedContent] = [] self._directories: List[Directory] = [] self._revisions: List[Revision] = [] # internal state, current visit self._last_revision: Optional[Revision] = None self._visit_status = "full" self.visit_date = visit_date - - if not cvsroot_path: - cvsroot_path = tempfile.mkdtemp( - suffix="-%s" % os.getpid(), - prefix=TEMPORARY_DIR_PREFIX_PATTERN, - dir=self.temp_directory, - ) self.cvsroot_path = cvsroot_path self.snapshot: Optional[Snapshot] = None self.last_snapshot: Optional[Snapshot] = snapshot_get_latest( self.storage, self.origin_url ) def compute_swh_revision( self, k: ChangeSetKey, logmsg: Optional[bytes] ) -> Tuple[Revision, from_disk.Directory]: """Compute swh hash data per CVS changeset. Returns: tuple (rev, swh_directory) - rev: current SWH revision computed from checked out work tree - swh_directory: dictionary of path, swh hash data with type """ # Compute SWH revision from the on-disk state swh_dir = from_disk.Directory.from_disk(path=os.fsencode(self.worktree_path)) parents: Tuple[Sha1Git, ...] 
if self._last_revision: parents = (self._last_revision.id,) else: parents = () revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents) self.log.debug("SWH revision ID: %s", hashutil.hash_to_hex(revision.id)) self._last_revision = revision return (revision, swh_dir) def checkout_file_with_rcsparse( self, k: ChangeSetKey, f: FileRevision, rcsfile: rcsparse.rcsfile ) -> None: + assert self.cvsroot_path path = file_path(self.cvsroot_path, f.path) wtpath = os.path.join(self.worktree_path, path) self.log.info("rev %s state %s file %s" % (f.rev, f.state, f.path)) if f.state == "dead": # remove this file from work tree try: os.remove(wtpath) except FileNotFoundError: pass else: # create, or update, this file in the work tree if not rcsfile: rcsfile = rcsparse.rcsfile(f.path) rcs = RcsKeywords() contents = rcs.expand_keyword(f.path, rcsfile, f.rev) os.makedirs(os.path.dirname(wtpath), exist_ok=True) outfile = open(wtpath, mode="wb") outfile.write(contents) outfile.close() def checkout_file_with_cvsclient( self, k: ChangeSetKey, f: FileRevision, cvsclient: CVSClient ): + assert self.cvsroot_path path = file_path(self.cvsroot_path, f.path) wtpath = os.path.join(self.worktree_path, path) self.log.info("rev %s state %s file %s" % (f.rev, f.state, f.path)) if f.state == "dead": # remove this file from work tree try: os.remove(wtpath) except FileNotFoundError: pass else: dirname = os.path.dirname(wtpath) os.makedirs(dirname, exist_ok=True) self.log.debug("checkout to %s\n" % wtpath) - fp = cvsclient.checkout(f.path, f.rev, dirname, expand_keywords=True) + fp = cvsclient.checkout(path, f.rev, dirname, expand_keywords=True) os.rename(fp.name, wtpath) try: fp.close() except FileNotFoundError: # Well, we have just renamed the file... pass def process_cvs_changesets( self, cvs_changesets: List[ChangeSetKey], use_rcsparse: bool, ) -> Iterator[ Tuple[List[Content], List[SkippedContent], List[Directory], Revision] ]: """Process CVS revisions. At each CVS revision, check out contents and compute swh hashes. Yields: tuple (contents, skipped-contents, directories, revision) of dict as a dictionary with keys, sha1_git, sha1, etc... """ for k in cvs_changesets: tstr = time.strftime("%c", time.gmtime(k.max_time)) self.log.info( "changeset from %s by %s on branch %s", tstr, k.author, k.branch ) logmsg: Optional[bytes] = b"" # Check out all files of this revision and get a log message. # # The log message is obtained from the first file in the changeset. # The message will usually be the same for all affected files, and # the SWH archive will only store one version of the log message. for f in k.revs: rcsfile = None if use_rcsparse: if rcsfile is None: rcsfile = rcsparse.rcsfile(f.path) if not logmsg: logmsg = rcsfile.getlog(k.revs[0].rev) self.checkout_file_with_rcsparse(k, f, rcsfile) else: if not logmsg: logmsg = self.rlog.getlog(self.rlog_file, f.path, k.revs[0].rev) self.checkout_file_with_cvsclient(k, f, self.cvsclient) # TODO: prune empty directories? (revision, swh_dir) = self.compute_swh_revision(k, logmsg) (contents, skipped_contents, directories) = from_disk.iter_directory( swh_dir ) yield contents, skipped_contents, directories, revision def prepare_origin_visit(self) -> None: self.origin = Origin( url=self.origin_url if self.origin_url else self.cvsroot_url ) def pre_cleanup(self) -> None: """Cleanup potential dangling files from prior runs (e.g. 
OOM killed tasks) """ clean_dangling_folders( self.temp_directory, pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, log=self.log, ) def cleanup(self) -> None: self.log.info("cleanup") def fetch_cvs_repo_with_rsync(self, host: str, path: str) -> None: # URL *must* end with a trailing slash in order to get CVSROOT listed url = "rsync://%s%s/" % (host, os.path.dirname(path)) rsync = subprocess.run(["rsync", url], capture_output=True, encoding="ascii") rsync.check_returncode() have_cvsroot = False have_module = False for line in rsync.stdout.split("\n"): self.log.debug("rsync server: %s", line) if line.endswith(" CVSROOT"): have_cvsroot = True elif line.endswith(" %s" % self.cvs_module_name): have_module = True if have_module and have_cvsroot: break if not have_module: raise NotFound( "CVS module %s not found at %s" % (self.cvs_module_name, url) ) if not have_cvsroot: raise NotFound("No CVSROOT directory found at %s" % url) - subprocess.run(["rsync", "-a", url, self.cvsroot_path]).check_returncode() + # mypy complains: List item 3 has incompatible type "Optional[str]"; + # because self.cvsroot_path is an optional argument. We do however + # ensure that it is initialized if the loader is not passed a + # corresponding argument. Better ideas than ignoring types on this line? + subprocess.run( + ["rsync", "-a", url, self.cvsroot_path] # type: ignore + ).check_returncode() def prepare(self) -> None: self._last_revision = None self.worktree_path = tempfile.mkdtemp( suffix="-%s" % os.getpid(), prefix=TEMPORARY_DIR_PREFIX_PATTERN, dir=self.temp_directory, ) url = parse_url(self.origin_url) self.log.debug( "prepare; origin_url=%s scheme=%s path=%s", self.origin_url, url.scheme, url.path, ) if not url.path: raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) self.cvs_module_name = os.path.basename(url.path) os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name)) - if url.scheme == "file": - if not os.path.exists(url.path): - raise NotFound - elif url.scheme == "rsync": - self.fetch_cvs_repo_with_rsync(url.host, url.path) - if url.scheme == "file" or url.scheme == "rsync": # local CVS repository conversion + if not self.cvsroot_path: + self.cvsroot_path = tempfile.mkdtemp( + suffix="-%s" % os.getpid(), + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + dir=self.temp_directory, + ) + if url.scheme == "file": + if not os.path.exists(url.path): + raise NotFound + elif url.scheme == "rsync": + self.fetch_cvs_repo_with_rsync(url.host, url.path) + have_rcsfile = False have_cvsroot = False for root, dirs, files in os.walk(self.cvsroot_path): if "CVSROOT" in dirs: have_cvsroot = True dirs.remove("CVSROOT") continue for f in files: filepath = os.path.join(root, f) if f[-2:] == ",v": rcsfile = rcsparse.rcsfile(filepath) # noqa: F841 self.log.debug( "Looks like we have data to convert; " "found a valid RCS file at %s", filepath, ) have_rcsfile = True break if have_rcsfile: break if not have_rcsfile: raise NotFound( "Directory %s does not contain any valid RCS files %s", self.cvsroot_path, ) if not have_cvsroot: self.log.warn( "The CVS repository at '%s' lacks a CVSROOT directory; " "we might be ingesting an incomplete copy of the repository", self.cvsroot_path, ) # Unfortunately, there is no way to convert CVS history in an # iterative fashion because the data is not indexed by any kind # of changeset ID. We need to walk the history of each and every # RCS file in the repository during every visit, even if no new # changes will be added to the SWH archive afterwards. 
# "CVS’s repository is the software equivalent of a telephone book # sorted by telephone number." # https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/ # # An implicit assumption made here is that self.cvs_changesets will # fit into memory in its entirety. If it won't fit then the CVS walker # will need to be modified such that it spools the list of changesets # to disk instead. cvs = CvsConv(self.cvsroot_path, RcsKeywords(), False, CHANGESET_FUZZ_SEC) self.log.info("Walking CVS module %s", self.cvs_module_name) cvs.walk(self.cvs_module_name) cvs_changesets = sorted(cvs.changesets) self.log.info( "CVS changesets found in %s: %d", self.cvs_module_name, len(cvs_changesets), ) self.swh_revision_gen = self.process_cvs_changesets( cvs_changesets, use_rcsparse=True ) elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": # remote CVS repository conversion + if not self.cvsroot_path: + self.cvsroot_path = os.path.dirname(url.path) self.cvsclient = CVSClient(url) cvsroot_path = os.path.dirname(url.path) self.log.info( "Fetching CVS rlog from %s:%s/%s", url.host, cvsroot_path, self.cvs_module_name, ) self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) - self.rlog_file = self.cvsclient.fetch_rlog() - self.rlog.parse_rlog(self.rlog_file) + main_rlog_file = self.cvsclient.fetch_rlog() + self.rlog.parse_rlog(main_rlog_file) + # Find file deletion events only visible in Attic directories. + main_changesets = self.rlog.changesets + attic_paths = [] + attic_rlog_files = [] + assert self.cvsroot_path + for k in main_changesets: + for changed_file in k.revs: + path = file_path(self.cvsroot_path, changed_file.path) + if path.startswith(self.cvsroot_path): + path = path[ + len(os.path.commonpath([self.cvsroot_path, path])) + 1 : + ] + parent_path = os.path.dirname(path) + + if parent_path.split("/")[-1] == "Attic": + continue + attic_path = parent_path + "/Attic" + if attic_path in attic_paths: + continue + attic_paths.append(attic_path) # avoid multiple visits + # Try to fetch more rlog data from this Attic directory. + attic_rlog_file = self.cvsclient.fetch_rlog( + path=attic_path, state="dead", + ) + if attic_rlog_file: + attic_rlog_files.append(attic_rlog_file) + if len(attic_rlog_files) == 0: + self.rlog_file = main_rlog_file + else: + # Combine all the rlog pieces we found and re-parse. + fp = tempfile.TemporaryFile() + for attic_rlog_file in attic_rlog_files: + for line in attic_rlog_file.readlines(): + fp.write(line) + attic_rlog_file.close() + main_rlog_file.seek(0) + for line in main_rlog_file.readlines(): + fp.write(line) + main_rlog_file.close() + fp.seek(0) + self.rlog.parse_rlog(cast(BinaryIO, fp)) + self.rlog_file = cast(BinaryIO, fp) cvs_changesets = sorted(self.rlog.changesets) self.log.info( "CVS changesets found for %s: %d", self.cvs_module_name, len(cvs_changesets), ) self.swh_revision_gen = self.process_cvs_changesets( cvs_changesets, use_rcsparse=False ) else: raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) def fetch_data(self) -> bool: """Fetch the next CVS revision.""" try: data = next(self.swh_revision_gen) except StopIteration: return False except Exception: self.log.exception("Exception in fetch_data:") return False # Stopping iteration self._contents, self._skipped_contents, self._directories, rev = data self._revisions = [rev] return True def build_swh_revision( self, k: ChangeSetKey, logmsg: Optional[bytes], dir_id: bytes, parents: Sequence[bytes], ) -> Revision: """Given a CVS revision, build a swh revision. 
Args: k: changeset data logmsg: the changeset's log message dir_id: the tree's hash identifier parents: the revision's parents identifier Returns: The swh revision dictionary. """ author = Person.from_fullname(k.author.encode("UTF-8")) date = TimestampWithTimezone.from_dict(k.max_time) return Revision( type=RevisionType.CVS, date=date, committer_date=date, directory=dir_id, message=logmsg, author=author, committer=author, synthetic=True, extra_headers=[], parents=tuple(parents), ) def generate_and_load_snapshot(self, revision: Revision) -> Snapshot: """Create the snapshot either from existing revision. Args: revision (dict): Last revision seen if any (None by default) Returns: Optional[Snapshot] The newly created snapshot """ snap = Snapshot( branches={ DEFAULT_BRANCH: SnapshotBranch( target=revision.id, target_type=TargetType.REVISION ) } ) self.log.debug("snapshot: %s", snap) self.storage.snapshot_add([snap]) return snap def store_data(self) -> None: "Add our current CVS changeset to the archive." self.storage.skipped_content_add(self._skipped_contents) self.storage.content_add(self._contents) self.storage.directory_add(self._directories) self.storage.revision_add(self._revisions) assert self._last_revision is not None self.snapshot = self.generate_and_load_snapshot(self._last_revision) self.log.debug("SWH snapshot ID: %s", hashutil.hash_to_hex(self.snapshot.id)) self.flush() self.loaded_snapshot_id = self.snapshot.id self._skipped_contents = [] self._contents = [] self._directories = [] self._revisions = [] def load_status(self) -> Dict[str, Any]: assert self.snapshot is not None if self.last_snapshot == self.snapshot: load_status = "uneventful" else: load_status = "eventful" return { "status": load_status, } def visit_status(self) -> str: return self._visit_status diff --git a/swh/loader/cvs/rlog.py b/swh/loader/cvs/rlog.py index 4e2dd2c..e09ee0b 100644 --- a/swh/loader/cvs/rlog.py +++ b/swh/loader/cvs/rlog.py @@ -1,494 +1,499 @@ """ RCS/CVS rlog parser, derived from viewvc and cvs2gitdump.py """ # Copyright (C) 1999-2021 The ViewCVS Group. All Rights Reserved. # # By using ViewVC, you agree to the terms and conditions set forth # below: # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following # disclaimer. # # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN # IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# Copyright (c) 2012 YASUOKA Masahiko # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. import calendar import re import time from typing import BinaryIO, Dict, List, NamedTuple, Optional, Tuple -from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ChangeSetKey, file_path +from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ChangeSetKey # There is no known encoding of path names in CVS. The actual encoding used # will depend on the CVS server's operating system and perhaps even the # underlying filesystem used to host a CVS repository. # It is even conceivable that a given repository may use multiple encodings, # e.g. due to migrations of the repository between different servers over time. # # This issue also affects the CVS network protocol which is communicating # paths between the CVS server and the CVS client. For this reason, most # public-facing repositories should stick to ASCII in practice. # # TODO: If known, the actual path encoding used by the repository should # be specified as a parameter. This parameter should be a list since # multiple encodings may be present in a given repository. path_encodings = ["ascii", "utf-8"] class revtuple(NamedTuple): number: str date: int author: bytes state: str branches: None revnumstr: None commitid: None class RlogConv: def __init__(self, cvsroot_path: str, fuzzsec: int) -> None: self.cvsroot_path = cvsroot_path self.fuzzsec = fuzzsec self.changesets: Dict[ChangeSetKey, ChangeSetKey] = dict() self.tags: Dict[str, ChangeSetKey] = dict() self.offsets: Dict[str, Dict[str, int]] = dict() def _process_rlog_revisions( self, path: str, taginfo: Dict[bytes, bytes], revisions: Dict[str, revtuple], - logmsgs: Dict[str, Optional[bytes]] + logmsgs: Dict[str, Optional[bytes]], ) -> None: """ Convert RCS revision history of a file into self.changesets items """ rtags: Dict[str, List[str]] = dict() # RCS and CVS represent branches by adding digits to revision numbers. # And CVS assigns special meaning to certain revision number ranges. # # Revision numbers on the main branch have only two digits: # # 1.1, 1.2, 1.3, ... # # Branches created with 'cvs tag -b' use even numbers for # the third digit: # # 1.1, 1.2, 1.3, ... main branch history of the file # | # 1.1.2.1, 1.1.2.2 ... a branch (2) forked off r1.1 of the file # # Branches are given human-readable names by associating # RCS tag labels with their revision numbers. # Given a file on the above branch which has been changed 10 times # since history was forked, the branch tag would look like this: # # MY_BRANCH: r1.1.2.10 # # Odd branch numbers are reserved for CVS "vendor" branches. # The default vendor branch is 1.1.1. # Vendor branches are populated with 'cvs import'. # Files on the vendor branch are merged to the main branch automatically # unless there are merge conflicts. 
Such conflicts have to be resolved # manually each time 'cvs import' is used to update the vendor branch. # # See here for details: # https://www.gnu.org/software/trans-coord/manual/cvs/html_node/Branches-and-revisions.html#Branches-and-revisions # # There are also "magic" branch numbers with a zero inserted # at the second-rightmost position: # # 1.1, 1.2, 1.3, ... main branch history of the file # | # 1.1.2.0.1 magic branch (2) # # This allows CVS to store information about a branch's existence # before any files on this branch have been modified. # Even-numbered branch revisions appear once the file is modified. branches = {"1": "HEAD", "1.1.1": "VENDOR"} k: str v_: str for k, v_ in list(taginfo.items()): # type: ignore # FIXME, inconsistent types r = v_.split(".") if len(r) == 3: # vendor branch number branches[v_] = "VENDOR" elif len(r) >= 3 and r[-2] == "0": # magic branch number branches[".".join(r[:-2] + r[-1:])] = k if len(r) == 2 and branches[r[0]] == "HEAD": # main branch number if v_ not in rtags: rtags[v_] = list() rtags[v_].append(k) revs: List[Tuple[str, revtuple]] = list(revisions.items()) # sort by revision descending to priorize 1.1.1.1 than 1.1 revs.sort(key=lambda a: a[1][0], reverse=True) # sort by time revs.sort(key=lambda a: a[1][1]) novendor = False have_initial_revision = False last_vendor_status = None for k, v in revs: r = k.split(".") if ( len(r) == 4 and r[0] == "1" and r[1] == "1" and r[2] == "1" and r[3] == "1" ): if have_initial_revision: continue if v[3] == "dead": continue last_vendor_status = v[3] have_initial_revision = True elif len(r) == 4 and r[0] == "1" and r[1] == "1" and r[2] == "1": if novendor: continue last_vendor_status = v[3] elif len(r) == 2: if r[0] == "1" and r[1] == "1": if have_initial_revision: continue if v[3] == "dead": continue have_initial_revision = True elif r[0] == "1" and r[1] != "1": novendor = True if last_vendor_status == "dead" and v[3] == "dead": last_vendor_status = None continue last_vendor_status = None else: # trunk only continue b = ".".join(r[:-1]) # decode author name in a potentially lossy way; # it is only used for internal hashing in this case author = v[2].decode("utf-8", "ignore") logmsg = logmsgs[k] assert logmsg is not None a = ChangeSetKey(branches[b], author, v[1], logmsg, v[6], self.fuzzsec) a.put_file(path, k, v[3], 0) while a in self.changesets: c = self.changesets[a] del self.changesets[a] c.merge(a) a = c self.changesets[a] = a if k in rtags: for t in rtags[k]: if t not in self.tags or self.tags[t].max_time < a.max_time: self.tags[t] = a def parse_rlog(self, fp: BinaryIO) -> None: + self.changesets = dict() + self.tags = dict() + self.offsets = dict() eof = None while eof != _EOF_LOG and eof != _EOF_ERROR: filename, branch, taginfo, lockinfo, errmsg, eof = _parse_log_header(fp) revisions: Dict[str, revtuple] = {} logmsgs: Dict[str, Optional[bytes]] = {} path = "" if filename: # There is no known encoding of filenames in CVS. # Attempt to decode the path with our list of known encodings. # If none of them work, forcefully decode the path assuming # the final path encoding provided in the list. 
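            # For example, a latin-1 path such as b"gr\xfcn" fails a strict
            # "ascii" and "utf-8" decode; the final encoding in the list is
            # tried with errors="ignore", which cannot raise, so fname is
            # always assigned.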
for i, e in enumerate(path_encodings): try: how = "ignore" if i == len(path_encodings) - 1 else "strict" fname = filename.decode(e, how) break except UnicodeError: pass - path = file_path(self.cvsroot_path, fname) + path = fname elif not eof: raise ValueError("No filename found in rlog header") while not eof: off = fp.tell() rev, logmsg, eof = _parse_log_entry(fp) if rev: revisions[rev[0]] = rev logmsgs[rev[0]] = logmsg if eof != _EOF_LOG and eof != _EOF_ERROR: if path not in self.offsets.keys(): self.offsets[path] = dict() if rev: self.offsets[path][rev[0]] = off self._process_rlog_revisions(path, taginfo, revisions, logmsgs) def getlog(self, fp: BinaryIO, path: str, rev: str) -> Optional[bytes]: off = self.offsets[path][rev] fp.seek(off) _rev, logmsg, eof = _parse_log_entry(fp) return logmsg # if your rlog doesn't use 77 '=' characters, then this must change LOG_END_MARKER = b"=" * 77 + b"\n" ENTRY_END_MARKER = b"-" * 28 + b"\n" _EOF_FILE = b"end of file entries" # no more entries for this RCS file _EOF_LOG = b"end of log" # hit the true EOF on the pipe _EOF_ERROR = b"error message found" # rlog issued an error # rlog error messages look like # # rlog: filename/goes/here,v: error message # rlog: filename/goes/here,v:123: error message # # so we should be able to match them with a regex like # # ^rlog\: (.*)(?:\:\d+)?\: (.*)$ # # But for some reason the windows version of rlog omits the "rlog: " prefix # for the first error message when the standard error stream has been # redirected to a file or pipe. (the prefix is present in subsequent errors # and when rlog is run from the console). So the expression below is more # complicated _re_log_error = re.compile(rb"^(?:rlog\: )*(.*,v)(?:\:\d+)?\: (.*)$") # CVSNT error messages look like: # cvs rcsfile: `C:/path/to/file,v' does not appear to be a valid rcs file # cvs [rcsfile aborted]: C:/path/to/file,v: No such file or directory # cvs [rcsfile aborted]: cannot open C:/path/to/file,v: Permission denied _re_cvsnt_error = re.compile( rb"^(?:cvs rcsfile\: |cvs \[rcsfile aborted\]: )" rb"(?:\`(.*,v)' |" rb"cannot open (.*,v)\: |(.*,v)\: |)" rb"(.*)$" ) -def _parse_log_header(fp: BinaryIO) -> Tuple[ - bytes, bytes, Dict[bytes, bytes], Dict[bytes, bytes], bytes, Optional[bytes] +def _parse_log_header( + fp: BinaryIO, +) -> Tuple[ + bytes, bytes, Dict[bytes, bytes], Dict[bytes, bytes], bytes, Optional[bytes] ]: """Parse and RCS/CVS log header. fp is a file (pipe) opened for reading the log information. On entry, fp should point to the start of a log entry. On exit, fp will have consumed the separator line between the header and the first revision log. If there is no revision information (e.g. the "-h" switch was passed to rlog), then fp will consumed the file separator line on exit. Returns: filename, default branch, tag dictionary, lock dictionary, rlog error message, and eof flag """ filename = branch = msg = b"" taginfo: Dict[bytes, bytes] = {} # tag name => number lockinfo: Dict[bytes, bytes] = {} # revision => locker state = 0 # 0 = base, 1 = parsing symbols, 2 = parsing locks eof = None while 1: line = fp.readline() if not line: # the true end-of-file eof = _EOF_LOG break if state == 1: if line[0] == b"\t": [tag, rev] = [x.strip() for x in line.split(b":")] taginfo[tag] = rev else: # oops. this line isn't tag info. stop parsing tags. state = 0 if state == 2: if line[0] == b"\t": [locker, rev] = [x.strip() for x in line.split(b":")] lockinfo[rev] = locker else: # oops. this line isn't lock info. stop parsing tags. 
state = 0 if state == 0: if line[:9] == b"RCS file:": filename = line[10:-1] elif line[:5] == b"head:": # head = line[6:-1] pass elif line[:7] == b"branch:": branch = line[8:-1] elif line[:6] == b"locks:": # start parsing the lock information state = 2 elif line[:14] == b"symbolic names": # start parsing the tag information state = 1 elif line == ENTRY_END_MARKER: # end of the headers break elif line == LOG_END_MARKER: # end of this file's log information eof = _EOF_FILE break else: error = _re_cvsnt_error.match(line) if error: p1, p2, p3, msg = error.groups() filename = p1 or p2 or p3 if not filename: raise ValueError( "Could not get filename from CVSNT error:\n%r" % line ) eof = _EOF_ERROR break error = _re_log_error.match(line) if error: filename, msg = error.groups() if msg[:30] == b"warning: Unknown phrases like ": # don't worry about this warning. it can happen with some RCS # files that have unknown fields in them e.g. "permissions 644;" continue eof = _EOF_ERROR break return filename, branch, taginfo, lockinfo, msg, eof _re_log_info = re.compile( rb"^date:\s+([^;]+);" rb"\s+author:\s+([^;]+);" rb"\s+state:\s+([^;]+);" rb"(\s+lines:\s+([0-9\s+-]+);?)?" rb"(\s+commitid:\s+([a-zA-Z0-9]+);)?\n$" ) # TODO: _re_rev should be updated to extract the "locked" flag _re_rev = re.compile(rb"^revision\s+([0-9.]+).*") def cvs_strptime(timestr): try: return time.strptime(timestr, "%Y/%m/%d %H:%M:%S")[:-1] + (0,) except ValueError: return time.strptime(timestr, "%Y-%m-%d %H:%M:%S %z")[:-1] + (0,) def _parse_log_entry(fp) -> Tuple[Optional[revtuple], Optional[bytes], Optional[bytes]]: """Parse a single log entry. On entry, fp should point to the first line of the entry (the "revision" line). On exit, fp will have consumed the log separator line (dashes) or the end-of-file marker (equals). Returns: Revision data tuple (number string, date, author, state, branches, revnumstr, commitid) if any, log, and eof flag (see _EOF_*) """ rev = None line = fp.readline() if not line: return None, None, _EOF_LOG if line == LOG_END_MARKER: # Needed because some versions of RCS precede LOG_END_MARKER # with ENTRY_END_MARKER return None, None, _EOF_FILE if line[:8] == b"revision": match = _re_rev.match(line) if not match: return None, None, _EOF_LOG rev = match.group(1) line = fp.readline() if not line: return None, None, _EOF_LOG match = _re_log_info.match(line) eof = None log = b"" while 1: line = fp.readline() if not line: # true end-of-file eof = _EOF_LOG break if line[:9] == b"branches:": continue if line == ENTRY_END_MARKER: break if line == LOG_END_MARKER: # end of this file's log information eof = _EOF_FILE break log = log + line if not rev or not match: # there was a parsing error return None, None, eof # parse out a time tuple for the local time tm = cvs_strptime(match.group(1).decode("UTF-8")) # rlog seems to assume that two-digit years are 1900-based (so, "04" # comes out as "1904", not "2004"). 
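    # For example, "04" parses as 1904; the adjustment below shifts such
    # years forward by a century (1904 becomes 2004), and any date still
    # before 1970 afterwards is rejected as invalid.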
EPOCH = 1970 if tm[0] < EPOCH: tm = list(tm) if (tm[0] - 1900) < 70: tm[0] = tm[0] + 100 if tm[0] < EPOCH: raise ValueError("invalid year") date = calendar.timegm(tm) # return a revision tuple compatible with 'rcsparse', the log message, # and the EOF marker return ( revtuple( rev.decode("ascii"), # revision number string date, match.group(2), # author (encoding is arbitrary; don't attempt to decode) match.group(3).decode( "ascii" ), # state, usually "Exp" or "dead"; non-ASCII data here would be weird None, # TODO: branches of this rev None, # TODO: revnumstr of previous rev None, # TODO: commitid ), log, eof, ) diff --git a/swh/loader/cvs/tests/data/dino-readded-file.tgz b/swh/loader/cvs/tests/data/dino-readded-file.tgz new file mode 100644 index 0000000..a001aeb Binary files /dev/null and b/swh/loader/cvs/tests/data/dino-readded-file.tgz differ diff --git a/swh/loader/cvs/tests/test_loader.py b/swh/loader/cvs/tests/test_loader.py index 747f110..91cbf99 100644 --- a/swh/loader/cvs/tests/test_loader.py +++ b/swh/loader/cvs/tests/test_loader.py @@ -1,537 +1,627 @@ # Copyright (C) 2016-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import os from swh.loader.cvs.loader import CvsLoader from swh.loader.tests import ( assert_last_visit_matches, check_snapshot, get_stats, prepare_repository_from_archive, ) from swh.model.hashutil import hash_to_bytes from swh.model.model import Snapshot, SnapshotBranch, TargetType RUNBABY_SNAPSHOT = Snapshot( id=hash_to_bytes("1cff69ab9bd70822d5e3006092f943ccaafdcf57"), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes("ef511d258fa55035c2bc2a5b05cad233cee1d328"), target_type=TargetType.REVISION, ) }, ) def test_loader_cvs_not_found_no_mock(swh_storage, tmp_path): """Given an unknown repository, the loader visit ends up in status not_found""" unknown_repo_url = "unknown-repository" loader = CvsLoader(swh_storage, unknown_repo_url, cvsroot_path=tmp_path) assert loader.load() == {"status": "uneventful"} assert_last_visit_matches( swh_storage, unknown_repo_url, status="not_found", type="cvs", ) def test_loader_cvs_visit(swh_storage, datadir, tmp_path): """Eventful visit should yield 1 snapshot""" archive_name = "runbaby" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) ) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=RUNBABY_SNAPSHOT.id, ) stats = get_stats(loader.storage) assert stats == { "content": 5, "directory": 2, "origin": 1, "origin_visit": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } check_snapshot(RUNBABY_SNAPSHOT, loader.storage) def test_loader_cvs_2_visits_no_change(swh_storage, datadir, tmp_path): """Eventful visit followed by uneventful visit should yield the same snapshot """ archive_name = "runbaby" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) ) assert loader.load() == {"status": "eventful"} visit_status1 = assert_last_visit_matches( loader.storage, repo_url, status="full", 
type="cvs", snapshot=RUNBABY_SNAPSHOT.id, ) loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) ) assert loader.load() == {"status": "uneventful"} visit_status2 = assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=RUNBABY_SNAPSHOT.id, ) assert visit_status1.date < visit_status2.date assert visit_status1.snapshot == visit_status2.snapshot stats = get_stats(loader.storage) assert stats["origin_visit"] == 1 + 1 # computed twice the same snapshot assert stats["snapshot"] == 1 GREEK_SNAPSHOT = Snapshot( id=hash_to_bytes("5e74af67d69dfd7aea0eb118154d062f71f50120"), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes("e18b92f14cd5b3efb3fcb4ea46cfaf97f25f301b"), target_type=TargetType.REVISION, ) }, ) def test_loader_cvs_with_file_additions_and_deletions(swh_storage, datadir, tmp_path): """Eventful conversion of history with file additions and deletions""" archive_name = "greek-repository" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_url += "/greek-tree" # CVS module name loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) ) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=GREEK_SNAPSHOT.id, ) stats = get_stats(loader.storage) assert stats == { "content": 8, "directory": 20, "origin": 1, "origin_visit": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 7, } check_snapshot(GREEK_SNAPSHOT, loader.storage) def test_loader_cvs_pserver_with_file_additions_and_deletions( swh_storage, datadir, tmp_path ): """Eventful CVS pserver conversion with file additions and deletions""" archive_name = "greek-repository" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_url += "/greek-tree" # CVS module name # Ask our cvsclient to connect via the 'cvs server' command repo_url = f"fake://{repo_url[7:]}" loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) ) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=GREEK_SNAPSHOT.id, ) stats = get_stats(loader.storage) assert stats == { "content": 8, "directory": 20, "origin": 1, "origin_visit": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 7, } check_snapshot(GREEK_SNAPSHOT, loader.storage) GREEK_SNAPSHOT2 = Snapshot( id=hash_to_bytes("048885ae2145ffe81588aea95dcf75c536ecdf26"), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes("55eb1438c03588607ce4b8db8f45e8e23075951b"), target_type=TargetType.REVISION, ) }, ) def test_loader_cvs_2_visits_with_change(swh_storage, datadir, tmp_path): """Eventful visit followed by eventful visit should yield two snapshots""" archive_name = "greek-repository" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_url += "/greek-tree" # CVS module name loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) ) assert loader.load() == {"status": "eventful"} visit_status1 = assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=GREEK_SNAPSHOT.id, ) stats = get_stats(loader.storage) assert stats == { "content": 8, 
"directory": 20, "origin": 1, "origin_visit": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 7, } archive_name2 = "greek-repository2" archive_path2 = os.path.join(datadir, f"{archive_name2}.tgz") repo_url = prepare_repository_from_archive(archive_path2, archive_name, tmp_path) repo_url += "/greek-tree" # CVS module name loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) ) assert loader.load() == {"status": "eventful"} visit_status2 = assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=GREEK_SNAPSHOT2.id, ) stats = get_stats(loader.storage) assert stats == { "content": 10, "directory": 23, "origin": 1, "origin_visit": 2, "release": 0, "revision": 8, "skipped_content": 0, "snapshot": 8, } check_snapshot(GREEK_SNAPSHOT2, loader.storage) assert visit_status1.date < visit_status2.date assert visit_status1.snapshot != visit_status2.snapshot def test_loader_cvs_visit_pserver(swh_storage, datadir, tmp_path): """Eventful visit to CVS pserver should yield 1 snapshot""" archive_name = "runbaby" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_url += "/runbaby" # CVS module name # Ask our cvsclient to connect via the 'cvs server' command repo_url = "fake://" + repo_url[7:] loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) ) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=RUNBABY_SNAPSHOT.id, ) stats = get_stats(loader.storage) assert stats == { "content": 5, "directory": 2, "origin": 1, "origin_visit": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } check_snapshot(RUNBABY_SNAPSHOT, loader.storage) GREEK_SNAPSHOT3 = Snapshot( id=hash_to_bytes("cd801546b0137c82f01b9b67848ba8261d64ebbb"), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes("14980990790ce1921db953c4c9ae03dd8861e8d6"), target_type=TargetType.REVISION, ) }, ) def test_loader_cvs_visit_pserver_no_eol(swh_storage, datadir, tmp_path): """Visit to CVS pserver with file that lacks trailing eol""" archive_name = "greek-repository3" extracted_name = "greek-repository" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, extracted_name, tmp_path) repo_url += "/greek-tree" # CVS module name # Ask our cvsclient to connect via the 'cvs server' command repo_url = "fake://" + repo_url[7:] loader = CvsLoader( - swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) + swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, extracted_name) ) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=GREEK_SNAPSHOT3.id, ) stats = get_stats(loader.storage) assert stats == { "content": 9, "directory": 23, "origin": 1, "origin_visit": 1, "release": 0, "revision": 8, "skipped_content": 0, "snapshot": 8, } check_snapshot(GREEK_SNAPSHOT3, loader.storage) GREEK_SNAPSHOT4 = Snapshot( id=hash_to_bytes("11673e2766654bd5fafb5119b418794230d48d6b"), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes("fe4a926d49d2af76e0025a8ba0b4ed159aec6829"), target_type=TargetType.REVISION, ) }, ) def test_loader_cvs_visit_expand_id_keyword(swh_storage, datadir, tmp_path): """Visit to CVS repository with file with an RCS Id keyword""" archive_name = "greek-repository4" 
extracted_name = "greek-repository" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, extracted_name, tmp_path) repo_url += "/greek-tree" # CVS module name loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, extracted_name) ) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=GREEK_SNAPSHOT4.id, ) stats = get_stats(loader.storage) assert stats == { "content": 9, "directory": 22, "origin": 1, "origin_visit": 1, "release": 0, "revision": 8, "skipped_content": 0, "snapshot": 8, } check_snapshot(GREEK_SNAPSHOT4, loader.storage) def test_loader_cvs_visit_pserver_expand_id_keyword(swh_storage, datadir, tmp_path): """Visit to CVS pserver with file with an RCS Id keyword""" archive_name = "greek-repository4" extracted_name = "greek-repository" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, extracted_name, tmp_path) repo_url += "/greek-tree" # CVS module name # Ask our cvsclient to connect via the 'cvs server' command repo_url = f"fake://{repo_url[7:]}" loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, extracted_name) ) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=GREEK_SNAPSHOT4.id, ) stats = get_stats(loader.storage) assert stats == { "content": 9, "directory": 22, "origin": 1, "origin_visit": 1, "release": 0, "revision": 8, "skipped_content": 0, "snapshot": 8, } check_snapshot(GREEK_SNAPSHOT4, loader.storage) GREEK_SNAPSHOT5 = Snapshot( id=hash_to_bytes("ee6faeaf50aa513c53c8ba29194116a5ef88add6"), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes("4320f152cc61ed660d25fdeebc787b3099e55a96"), target_type=TargetType.REVISION, ) }, ) def test_loader_cvs_with_file_deleted_and_readded(swh_storage, datadir, tmp_path): """Eventful conversion of history with file deletion and re-addition""" archive_name = "greek-repository5" extracted_name = "greek-repository" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, extracted_name, tmp_path) repo_url += "/greek-tree" # CVS module name loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, extracted_name) ) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( loader.storage, repo_url, status="full", type="cvs", snapshot=GREEK_SNAPSHOT5.id, ) stats = get_stats(loader.storage) assert stats == { "content": 9, "directory": 22, "origin": 1, "origin_visit": 1, "release": 0, "revision": 8, "skipped_content": 0, "snapshot": 8, } check_snapshot(GREEK_SNAPSHOT5, loader.storage) def test_loader_cvs_pserver_with_file_deleted_and_readded( swh_storage, datadir, tmp_path ): """Eventful pserver conversion with file deletion and re-addition""" archive_name = "greek-repository5" extracted_name = "greek-repository" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, extracted_name, tmp_path) repo_url += "/greek-tree" # CVS module name # Ask our cvsclient to connect via the 'cvs server' command repo_url = f"fake://{repo_url[7:]}" loader = CvsLoader( swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, extracted_name) ) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( loader.storage, repo_url, status="full", 
type="cvs", snapshot=GREEK_SNAPSHOT5.id, ) stats = get_stats(loader.storage) assert stats == { "content": 9, "directory": 22, "origin": 1, "origin_visit": 1, "release": 0, "revision": 8, "skipped_content": 0, "snapshot": 8, } check_snapshot(GREEK_SNAPSHOT5, loader.storage) + + +DINO_SNAPSHOT = Snapshot( + id=hash_to_bytes("417021c16e17c5e0038cf0e73dbf48a6142c8304"), + branches={ + b"HEAD": SnapshotBranch( + target=hash_to_bytes("df61a776c401a178cc796545849fc87bdadb2001"), + target_type=TargetType.REVISION, + ) + }, +) + + +def test_loader_cvs_readded_file_in_attic(swh_storage, datadir, tmp_path): + """Conversion of history with RCS files in the Attic""" + # This repository has some file revisions marked "dead" in the Attic only. + # This is different to the re-added file tests above, where the RCS file + # was moved out of the Attic again as soon as the corresponding deleted + # file was re-added. Failure to detect the "dead" file revisions in the + # Attic would result in errors in our converted history. + archive_name = "dino-readded-file" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + repo_url += "/src" # CVS module name + + loader = CvsLoader( + swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) + ) + + assert loader.load() == {"status": "eventful"} + + assert_last_visit_matches( + loader.storage, repo_url, status="full", type="cvs", snapshot=DINO_SNAPSHOT.id, + ) + + stats = get_stats(loader.storage) + assert stats == { + "content": 38, + "directory": 105, + "origin": 1, + "origin_visit": 1, + "release": 0, + "revision": 35, + "skipped_content": 0, + "snapshot": 35, + } + + check_snapshot(DINO_SNAPSHOT, loader.storage) + + +def test_loader_cvs_pserver_readded_file_in_attic(swh_storage, datadir, tmp_path): + """Conversion over pserver with RCS files in the Attic""" + # This repository has some file revisions marked "dead" in the Attic only. + # This is different to the re-added file tests above, where the RCS file + # was moved out of the Attic again as soon as the corresponding deleted + # file was re-added. Failure to detect the "dead" file revisions in the + # Attic would result in errors in our converted history. + # This has special implications for the pserver case, because the "dead" + # revisions will not appear in in the output of 'cvs rlog' by default. + archive_name = "dino-readded-file" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + repo_url += "/src" # CVS module name + + # Ask our cvsclient to connect via the 'cvs server' command + repo_url = f"fake://{repo_url[7:]}" + + loader = CvsLoader( + swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) + ) + + assert loader.load() == {"status": "eventful"} + + assert_last_visit_matches( + loader.storage, repo_url, status="full", type="cvs", snapshot=DINO_SNAPSHOT.id, + ) + + stats = get_stats(loader.storage) + assert stats == { + "content": 38, + "directory": 105, + "origin": 1, + "origin_visit": 1, + "release": 0, + "revision": 35, + "skipped_content": 0, + "snapshot": 35, + } + + check_snapshot(DINO_SNAPSHOT, loader.storage)