diff --git a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py --- a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py +++ b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py @@ -478,7 +478,7 @@ def git_dump_file(path: str, k, rcs, markseq) -> None: try: - cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k) + cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k, []) except RuntimeError as msg: print('Unexpected runtime error on parsing', path, k, ':', msg, file=sys.stderr) @@ -566,7 +566,7 @@ fl |= self.RCS_KWEXP_ERR return fl - def expand_keyword(self, filename: str, rcs: rcsparse.rcsfile, r: str) -> bytes: + def expand_keyword(self, filename: str, rcs: rcsparse.rcsfile, r: str, excluded_keywords: List[str]) -> bytes: """ Check out a file with keywords expanded. Expansion rules are specific to each keyword, and some cases specific to undocumented behaviour of CVS. @@ -601,11 +601,25 @@ break prefix = line[:m.start(1) - 1] next_match_segment = copy.deepcopy(line[dsign:]) - line = line[dsign + 1:] expbuf = '' + try: + kwname = m.group(1).decode('ascii') + except UnicodeDecodeError: + # Not a valid RCS keyword, use it as it is + ret.append(line) + break + if kwname in excluded_keywords: + line0 += prefix + m.group(1) + m = self.re_kw.match(next_match_segment) + if m: + line = next_match_segment + continue + else: + ret.append(line0 + line[dsign + 1:]) + break + line = line[dsign + 1:] if (mode & self.RCS_KWEXP_NAME) != 0: - expbuf += '$' - expbuf += m.group(1).decode('ascii') + expbuf += '$%s' % kwname if (mode & self.RCS_KWEXP_VAL) != 0: expbuf += ': ' if (mode & self.RCS_KWEXP_VAL) != 0: diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py --- a/swh/loader/cvs/loader.py +++ b/swh/loader/cvs/loader.py @@ -104,6 +104,8 @@ self._visit_status = "full" self.visit_date = visit_date self.cvsroot_path = cvsroot_path + self.custom_id_keyword = None + self.excluded_keywords: List[str] = [] self.snapshot: Optional[Snapshot] = None 
self.last_snapshot: Optional[Snapshot] = snapshot_get_latest( @@ -171,7 +173,11 @@ if server_style_path[0] != "/": server_style_path = "/" + server_style_path - contents = rcs.expand_keyword(server_style_path, rcsfile, f.rev) + if self.custom_id_keyword is not None: + rcs.add_id_keyword(self.custom_id_keyword) + contents = rcs.expand_keyword( + server_style_path, rcsfile, f.rev, self.excluded_keywords + ) os.makedirs(os.path.dirname(wtpath), exist_ok=True) outfile = open(wtpath, mode="wb") outfile.write(contents) @@ -266,6 +272,47 @@ def cleanup(self) -> None: self.log.info("cleanup") + def configure_custom_id_keyword(self, cvsconfig): + """Parse CVSROOT/config and look for a custom keyword definition. + There are two different configuration directives in use for this purpose. + + The first variant stems from a patch which was never accepted into + upstream CVS and uses the tag directive: tag=MyName + With this, the "MyName" keyword becomes an alias for the "Id" keyword. + This variant is prevalent in CVS versions shipped on BSD. + + The second variant stems from upstream CVS 1.12 and looks like: + LocalKeyword=MyName=SomeKeyword + KeywordExpand=iMyName + We only support "SomeKeyword" if it specifies "Id" or "CVSHeader", for now. + The KeywordExpand directive can be used to suppress expansion of keywords + by listing keywords after an initial "e" character ("exclude", as opposed + to an "include" list which uses an initial "i" character). 
+ For example, this disables expansion of the Date and Name keywords: + KeywordExpand=eDate,Name + """ + for line in cvsconfig.readlines(): + line = line.strip() + try: + (config_key, value) = line.split("=", 1) + except ValueError: + continue + config_key = config_key.strip() + value = value.strip() + if config_key == "tag": + self.custom_id_keyword = value + elif config_key == "LocalKeyword": + try: + (custom_kwname, kwname) = value.split("=", 1) + except ValueError: + continue + if kwname.strip() in ("Id", "CVSHeader"): + self.custom_id_keyword = custom_kwname.strip() + elif config_key == "KeywordExpand" and value.startswith("e"): + excluded_keywords = value[1:].split(",") + for k in excluded_keywords: + self.excluded_keywords.append(k.strip()) + def fetch_cvs_repo_with_rsync(self, host: str, path: str) -> None: # URL *must* end with a trailing slash in order to get CVSROOT listed url = "rsync://%s%s/" % (host, os.path.dirname(path)) @@ -288,12 +335,17 @@ if not have_cvsroot: raise NotFound("No CVSROOT directory found at %s" % url) + # Fetch the CVSROOT directory and the desired CVS module. assert self.cvsroot_path - subprocess.run( - # Ensure that rsync will place files directly within our cvsroot - # directory by appending a "/" to our cvsroot path. - ["rsync", "-a", url, self.cvsroot_path + "/"] - ).check_returncode() + for d in ("CVSROOT", self.cvs_module_name): + target_dir = os.path.join(self.cvsroot_path, d) + os.makedirs(target_dir, exist_ok=True) + subprocess.run( + # Append trailing path separators ("/" in the URL and os.path.sep in the + # local target directory path) to ensure that rsync will place files + # directly within our target directory. + ["rsync", "-a", url + d + "/", target_dir + os.path.sep] + ).check_returncode() def prepare(self) -> None: self._last_revision = None @@ -361,6 +413,14 @@ self.cvsroot_path, ) + # The file CVSROOT/config will usually contain ASCII data only. + # We allow UTF-8 just in case. 
Other encodings may result in an + # error and will require manual intervention, for now. + cvsconfig_path = os.path.join(self.cvsroot_path, "CVSROOT", "config") + cvsconfig = open(cvsconfig_path, mode="r", encoding="utf-8") + self.configure_custom_id_keyword(cvsconfig) + cvsconfig.close() + # Unfortunately, there is no way to convert CVS history in an # iterative fashion because the data is not indexed by any kind # of changeset ID. We need to walk the history of each and every diff --git a/swh/loader/cvs/tests/data/greek-repository9.tgz b/swh/loader/cvs/tests/data/greek-repository9.tgz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@