diff --git a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py --- a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py +++ b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py @@ -248,7 +248,7 @@ class FileRevision: - def __init__(self, path: str, rev: str, state: str, markseq: int) -> None: + def __init__(self, path: bytes, rev: str, state: str, markseq: int) -> None: self.path = path self.rev = rev self.state = state @@ -336,7 +336,7 @@ def __hash__(self) -> int: return hash(self.branch + '/' + self.author) * 31 + self.log_hash - def put_file(self, path: str, rev: str, state: str, markseq: int): + def put_file(self, path: bytes, rev: str, state: str, markseq: int): self.revs.append(FileRevision(path, rev, state, markseq)) @@ -363,19 +363,19 @@ p.append(module) path = os.path.join(*p) - for root, dirs, files in os.walk(path): - if '.git' in dirs: + for root, dirs, files in os.walk(os.fsencode(path)): + if b'.git' in dirs: print('Ignore %s: cannot handle the path named \'.git\'' % ( - root + os.sep + '.git'), file=sys.stderr) - dirs.remove('.git') - if '.git' in files: + os.path.join(root, b'.git')), file=sys.stderr) + dirs.remove(b'.git') + if b'.git' in files: print('Ignore %s: cannot handle the path named \'.git\'' % ( - root + os.sep + '.git'), file=sys.stderr) - files.remove('.git') + os.path.join(root, b'.git')), file=sys.stderr) + files.remove(b'.git') for f in files: - if not f[-2:] == ',v': + if not f[-2:] == b',v': continue - self.parse_file(root + os.sep + f) + self.parse_file(os.path.join(root, f)) for t, c in list(self.tags.items()): c.tags.append(t) @@ -461,16 +461,16 @@ self.tags[t] = a -def file_path(r: str, p: str) -> str: - if r.endswith('/'): +def file_path(r: bytes, p: bytes) -> bytes: + if r.endswith(b'/'): r = r[:-1] - if p[-2:] == ',v': + if p[-2:] == b',v': path = p[:-2] # drop ",v" else: path = p - p_ = path.split('/') - if len(p_) > 0 and p_[-2] == 'Attic': - path = '/'.join(p_[:-2] + [p_[-1]]) + p_ = path.split(b'/') + if len(p_) > 0 and p_[-2] == b'Attic': + path = b'/'.join(p_[:-2] + [p_[-1]]) if path.startswith(r): path = path[len(r) + 1:] return path @@ -566,7 +566,7 @@ fl |= self.RCS_KWEXP_ERR return fl - def expand_keyword(self, filename: str, rcs: rcsparse.rcsfile, r: str, excluded_keywords: List[str]) -> bytes: + def expand_keyword(self, filename: str, rcs: rcsparse.rcsfile, r: str, excluded_keywords: List[str], filename_encoding="utf-8") -> bytes: """ Check out a file with keywords expanded. Expansion rules are specific to each keyword, and some cases specific to undocumented behaviour of CVS. @@ -698,9 +698,9 @@ if (mode & self.RCS_KWEXP_NAME) != 0: expbuf += '$' if logbuf is not None: - ret.append(prefix + expbuf.encode('ascii') + b'\n' + logbuf) + ret.append(prefix + expbuf.encode(filename_encoding) + b'\n' + logbuf) else: - line0 += prefix + expbuf[:255].encode('ascii') + line0 += prefix + expbuf[:255].encode(filename_encoding) m = self.re_kw.match(next_match_segment) if m: line = next_match_segment diff --git a/swh/loader/cvs/cvsclient.py b/swh/loader/cvs/cvsclient.py --- a/swh/loader/cvs/cvsclient.py +++ b/swh/loader/cvs/cvsclient.py @@ -11,6 +11,7 @@ import socket import subprocess import tempfile +from typing import Tuple from tenacity import retry from tenacity.retry import retry_if_exception_type @@ -85,6 +86,28 @@ return "".join(s) +def decode_path(path: bytes) -> Tuple[str, str]: + """Attempt to decode a file path based on encodings known to be used + in CVS repositories that can be found in the wild. + + Args: + path: raw bytes path + + Returns: + A tuple (decoded path, encoding) + + """ + path_encodings = ["ascii", "iso-8859-1", "utf-8"] + for encoding in path_encodings: + try: + how = "ignore" if encoding == path_encodings[-1] else "strict" + path_str = path.decode(encoding, how) + break + except UnicodeError: + pass + return path_str, encoding + + class CVSProtocolError(Exception): pass @@ -207,8 +230,8 @@ return self.ssh.stdin.flush() raise Exception("No valid connection") - def conn_write_str(self, s): - return self.conn_write(s.encode("UTF-8")) + def conn_write_str(self, s, encoding="utf-8"): + return self.conn_write(s.encode(encoding)) def conn_close(self): if self.socket: @@ -292,8 +315,12 @@ rlog_output.seek(0) return rlog_output - def fetch_rlog(self, path="", state=""): - path_arg = path or self.cvs_module_name + def fetch_rlog(self, path: bytes = b"", state=""): + if path: + path_arg, encoding = decode_path(path) + else: + path_arg, encoding = self.cvs_module_name, "utf-8" + if len(state) > 0: state_arg = "Argument -s%s\n" % state else: @@ -304,7 +331,8 @@ f"{state_arg}" "Argument --\n" f"Argument {path_arg}\n" - "rlog\n" + "rlog\n", + encoding=encoding, ) while True: response = self.conn_read_line() @@ -325,7 +353,7 @@ fp.seek(0) return self._parse_rlog_response(fp) - def checkout(self, path, rev, dest_dir, expand_keywords): + def checkout(self, path: bytes, rev: str, dest_dir: bytes, expand_keywords: bool): """ Download a file revision from the cvs server and store the file's contents in a temporary file. If expand_keywords is @@ -342,15 +370,18 @@ expect_bytecount = False have_bytecount = False bytecount = 0 - dirname = os.path.dirname(path) + + path_str, encoding = decode_path(path) + + dirname = os.path.dirname(path_str) if dirname: self.conn_write_str( "Directory %s\n%s\n" % (dirname, os.path.join(self.cvsroot_path, dirname)) ) - filename = os.path.basename(path) + filename = os.path.basename(path_str) co_output = tempfile.NamedTemporaryFile( - dir=dest_dir, + dir=os.fsdecode(dest_dir), delete=True, prefix="cvsclient-checkout-%s-r%s-" % (filename, rev), ) @@ -372,8 +403,9 @@ os.path.join(self.cvsroot_path, self.cvs_module_name), rev, karg, - path, - ) + path_str, + ), + encoding=encoding, ) while True: if have_bytecount: diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py --- a/swh/loader/cvs/loader.py +++ b/swh/loader/cvs/loader.py @@ -31,7 +31,7 @@ RcsKeywords, file_path, ) -from swh.loader.cvs.cvsclient import CVSClient +from swh.loader.cvs.cvsclient import CVSClient, decode_path import swh.loader.cvs.rcsparse as rcsparse from swh.loader.cvs.rlog import RlogConv from swh.loader.exception import NotFound @@ -146,15 +146,13 @@ self._last_revision = revision return (revision, swh_dir) - def file_path_is_safe(self, wtpath): - if "%s..%s" % (os.path.sep, os.path.sep) in wtpath: + def file_path_is_safe(self, wtpath: bytes): + tempdir = os.fsencode(self.tempdir_path) + if os.fsencode("%s..%s" % (os.path.sep, os.path.sep)) in wtpath: # Paths with back-references should not appear # in CVS protocol messages or CVS rlog output return False - elif ( - os.path.commonpath([self.tempdir_path, os.path.normpath(wtpath)]) - != self.tempdir_path - ): + elif os.path.commonpath([tempdir, os.path.normpath(wtpath)]) != tempdir: # The path must be a child of our temporary directory. return False else: @@ -165,10 +163,10 @@ ) -> None: assert self.cvsroot_path assert self.server_style_cvsroot - path = file_path(self.cvsroot_path, f.path) - wtpath = os.path.join(self.tempdir_path, path) + path = file_path(os.fsencode(self.cvsroot_path), f.path) + wtpath = os.path.join(os.fsencode(self.tempdir_path), path) if not self.file_path_is_safe(wtpath): - raise BadPathException(f"unsafe path found in RCS file: {f.path}") + raise BadPathException(f"unsafe path found in RCS file: {f.path!r}") self.log.debug("rev %s state %s file %s", f.rev, f.state, f.path) if f.state == "dead": # remove this file from work tree @@ -194,7 +192,10 @@ # a distinct origin, but will hopefully point at the same SWH snapshot. # In any case, an absolute path based on the origin URL looks nicer than # an absolute path based on a temporary directory used by the CVS loader. - server_style_path = f.path.replace( + + path_str, encoding = decode_path(f.path) + + server_style_path = path_str.replace( self.cvsroot_path, self.server_style_cvsroot ) if server_style_path[0] != "/": @@ -203,7 +204,11 @@ if self.custom_id_keyword is not None: rcs.add_id_keyword(self.custom_id_keyword) contents = rcs.expand_keyword( - server_style_path, rcsfile, f.rev, self.excluded_keywords + server_style_path, + rcsfile, + f.rev, + self.excluded_keywords, + filename_encoding=encoding, ) os.makedirs(os.path.dirname(wtpath), exist_ok=True) outfile = open(wtpath, mode="wb") @@ -214,10 +219,10 @@ self, k: ChangeSetKey, f: FileRevision, cvsclient: CVSClient ): assert self.cvsroot_path - path = file_path(self.cvsroot_path, f.path) - wtpath = os.path.join(self.tempdir_path, path) + path = file_path(os.fsencode(self.cvsroot_path), f.path) + wtpath = os.path.join(os.fsencode(self.tempdir_path), path) if not self.file_path_is_safe(wtpath): - raise BadPathException(f"unsafe path found in cvs rlog output: {f.path}") + raise BadPathException(f"unsafe path found in cvs rlog output: {f.path!r}") self.log.debug("rev %s state %s file %s", f.rev, f.state, f.path) if f.state == "dead": # remove this file from work tree @@ -416,14 +421,14 @@ have_rcsfile = False have_cvsroot = False - for root, dirs, files in os.walk(self.cvsroot_path): - if "CVSROOT" in dirs: + for root, dirs, files in os.walk(os.fsencode(self.cvsroot_path)): + if b"CVSROOT" in dirs: have_cvsroot = True - dirs.remove("CVSROOT") + dirs.remove(b"CVSROOT") continue for f in files: filepath = os.path.join(root, f) - if f[-2:] == ",v": + if f[-2:] == b",v": rcsfile = rcsparse.rcsfile(filepath) # noqa: F841 self.log.debug( "Looks like we have data to convert; " @@ -500,18 +505,19 @@ attic_paths = [] attic_rlog_files = [] assert self.cvsroot_path + cvsroot_path_bytes = os.fsencode(self.cvsroot_path) for k in main_changesets: for changed_file in k.revs: - path = file_path(self.cvsroot_path, changed_file.path) - if path.startswith(self.cvsroot_path): + path = file_path(cvsroot_path_bytes, changed_file.path) + if path.startswith(cvsroot_path_bytes): path = path[ - len(os.path.commonpath([self.cvsroot_path, path])) + 1 : + len(os.path.commonpath([cvsroot_path_bytes, path])) + 1 : ] parent_path = os.path.dirname(path) - if parent_path.split("/")[-1] == "Attic": + if parent_path.split(b"/")[-1] == b"Attic": continue - attic_path = parent_path + "/Attic" + attic_path = parent_path + b"/Attic" if attic_path in attic_paths: continue attic_paths.append(attic_path) # avoid multiple visits diff --git a/swh/loader/cvs/rcsparse.pyi b/swh/loader/cvs/rcsparse.pyi --- a/swh/loader/cvs/rcsparse.pyi +++ b/swh/loader/cvs/rcsparse.pyi @@ -20,7 +20,7 @@ revs: Mapping[str, Tuple[str, int, str, str, List[str], str, str]] # actually rcsparse.rcsrevtree desc: str - def __init__(self, path: str): ... + def __init__(self, path: bytes): ... def checkout(self, rev: str = "HEAD") -> bytes: ... def getlog(self, rev: str) -> bytes: ... diff --git a/swh/loader/cvs/rcsparse/py-rcsparse.c b/swh/loader/cvs/rcsparse/py-rcsparse.c --- a/swh/loader/cvs/rcsparse/py-rcsparse.c +++ b/swh/loader/cvs/rcsparse/py-rcsparse.c @@ -726,8 +726,9 @@ pyrcsfile_init(struct pyrcsfile *pyrcs, PyObject *args) { const char *filename; + Py_ssize_t length; - if (!PyArg_ParseTuple(args, "s", &filename)) + if (!PyArg_ParseTuple(args, "s#", &filename, &length)) return -1; pyrcs->rcs = rcsopen(filename); diff --git a/swh/loader/cvs/rlog.py b/swh/loader/cvs/rlog.py --- a/swh/loader/cvs/rlog.py +++ b/swh/loader/cvs/rlog.py @@ -52,21 +52,6 @@ from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ChangeSetKey -# There is no known encoding of path names in CVS. The actual encoding used -# will depend on the CVS server's operating system and perhaps even the -# underlying filesystem used to host a CVS repository. -# It is even conceivable that a given repository may use multiple encodings, -# e.g. due to migrations of the repository between different servers over time. -# -# This issue also affects the CVS network protocol which is communicating -# paths between the CVS server and the CVS client. For this reason, most -# public-facing repositories should stick to ASCII in practice. -# -# TODO: If known, the actual path encoding used by the repository should -# be specified as a parameter. This parameter should be a list since -# multiple encodings may be present in a given repository. -path_encodings = ["ascii", "utf-8"] - class revtuple(NamedTuple): number: str @@ -84,11 +69,11 @@ self.fuzzsec = fuzzsec self.changesets: Dict[ChangeSetKey, ChangeSetKey] = dict() self.tags: Dict[str, ChangeSetKey] = dict() - self.offsets: Dict[str, Dict[str, int]] = dict() + self.offsets: Dict[bytes, Dict[str, int]] = dict() def _process_rlog_revisions( self, - path: str, + path: bytes, taginfo: Dict[bytes, bytes], revisions: Dict[str, revtuple], logmsgs: Dict[str, Optional[bytes]], @@ -227,20 +212,9 @@ filename, branch, taginfo, lockinfo, errmsg, eof = _parse_log_header(fp) revisions: Dict[str, revtuple] = {} logmsgs: Dict[str, Optional[bytes]] = {} - path = "" + path = b"" if filename: - # There is no known encoding of filenames in CVS. - # Attempt to decode the path with our list of known encodings. - # If none of them work, forcefully decode the path assuming - # the final path encoding provided in the list. - for i, e in enumerate(path_encodings): - try: - how = "ignore" if i == len(path_encodings) - 1 else "strict" - fname = filename.decode(e, how) - break - except UnicodeError: - pass - path = fname + path = filename elif not eof: raise ValueError("No filename found in rlog header") while not eof: @@ -257,7 +231,7 @@ self._process_rlog_revisions(path, taginfo, revisions, logmsgs) - def getlog(self, fp: BinaryIO, path: str, rev: str) -> Optional[bytes]: + def getlog(self, fp: BinaryIO, path: bytes, rev: str) -> Optional[bytes]: off = self.offsets[path][rev] fp.seek(off) _rev, logmsg, eof = _parse_log_entry(fp) diff --git a/swh/loader/cvs/tests/test_loader.py b/swh/loader/cvs/tests/test_loader.py --- a/swh/loader/cvs/tests/test_loader.py +++ b/swh/loader/cvs/tests/test_loader.py @@ -1257,3 +1257,34 @@ # check cvs client can be instantiated without errors CVSClient(parsed_url) + + +@pytest.mark.parametrize("protocol", ["rsync", "pserver"]) +def test_loader_cvs_with_non_utf8_directory_paths( + swh_storage, datadir, tmp_path, protocol +): + archive_name = "greek-repository" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + repo_url += "/greek-tree" # CVS module name + + protocol_prefix = "file://" + if protocol == "pserver": + protocol_prefix = "fake://" + repo_url = repo_url.replace("file://", protocol_prefix) + + for root, _, files in os.walk(repo_url.replace(protocol_prefix, "")): + for file in files: + # clone existing file in repository but makes it path non UTF-8 encoded + filepath = os.path.join(root, file) + with open(filepath, "rb") as f: + filecontent = f.read() + filepath = root.encode() + ("é" + file).encode("iso-8859-1") + with open(filepath, "wb") as f: + f.write(filecontent) + + loader = CvsLoader( + swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name) + ) + + assert loader.load() == {"status": "eventful"}