diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py
--- a/swh/loader/cvs/loader.py
+++ b/swh/loader/cvs/loader.py
@@ -54,6 +54,10 @@
 TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.cvs."
 
 
+class BadPathException(Exception):
+    pass
+
+
 class CvsLoader(BaseLoader):
     """Swh cvs loader.
 
@@ -83,6 +87,7 @@
         cvsroot_path: Optional[str] = None,
         temp_directory: str = "/tmp",
         max_content_size: Optional[int] = None,
+        rlog_file_override: Optional[str] = None,
     ):
         super().__init__(
             storage=storage,
@@ -106,6 +111,8 @@
         self.cvsroot_path = cvsroot_path
         self.custom_id_keyword = None
         self.excluded_keywords: List[str] = []
+        # for use by tests which poke at our rlog parser:
+        self.rlog_file_override = rlog_file_override
 
         self.snapshot: Optional[Snapshot] = None
         self.last_snapshot: Optional[Snapshot] = snapshot_get_latest(
@@ -142,6 +149,12 @@
         assert self.server_style_cvsroot
         path = file_path(self.cvsroot_path, f.path)
         wtpath = os.path.join(self.tempdir_path, path)
+        # The file path must be a child of our temporary directory.
+        if (
+            os.path.commonpath([self.tempdir_path, os.path.normpath(wtpath)])
+            != self.tempdir_path
+        ):
+            raise BadPathException("weird path found in RCS file: %s" % f.path)
         self.log.info("rev %s state %s file %s" % (f.rev, f.state, f.path))
         if f.state == "dead":
             # remove this file from work tree
@@ -189,6 +202,13 @@
         assert self.cvsroot_path
         path = file_path(self.cvsroot_path, f.path)
         wtpath = os.path.join(self.tempdir_path, path)
+        # The file path must be a child of our temporary directory.
+        if (
+            "%s..%s" % (os.path.sep, os.path.sep) in wtpath
+            or os.path.commonpath([self.tempdir_path, os.path.normpath(wtpath)])
+            != self.tempdir_path
+        ):
+            raise BadPathException("weird path found in cvs rlog output: %s" % f.path)
         self.log.info("rev %s state %s file %s" % (f.rev, f.state, f.path))
         if f.state == "dead":
             # remove this file from work tree
@@ -459,7 +479,11 @@
                 self.cvs_module_name,
             )
             self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC)
-            main_rlog_file = self.cvsclient.fetch_rlog()
+            if self.rlog_file_override is not None:  # used for testing purposes only
+                main_rlog_file = open(self.rlog_file_override, "rb")
+            else:
+                main_rlog_file = self.cvsclient.fetch_rlog()
+            self.log.info("parse rlog %s" % main_rlog_file)
             self.rlog.parse_rlog(main_rlog_file)
             # Find file deletion events only visible in Attic directories.
             main_changesets = self.rlog.changesets
@@ -481,12 +505,13 @@
                     if attic_path in attic_paths:
                         continue
                     attic_paths.append(attic_path)  # avoid multiple visits
-                    # Try to fetch more rlog data from this Attic directory.
-                    attic_rlog_file = self.cvsclient.fetch_rlog(
-                        path=attic_path, state="dead",
-                    )
-                    if attic_rlog_file:
-                        attic_rlog_files.append(attic_rlog_file)
+                    if self.rlog_file_override is None:
+                        # Try to fetch more rlog data from this Attic directory.
+                        attic_rlog_file = self.cvsclient.fetch_rlog(
+                            path=attic_path, state="dead",
+                        )
+                        if attic_rlog_file:
+                            attic_rlog_files.append(attic_rlog_file)
             if len(attic_rlog_files) == 0:
                 self.rlog_file = main_rlog_file
             else:
@@ -528,6 +553,7 @@
             return False
         except Exception:
             self.log.exception("Exception in fetch_data:")
+            self._visit_status = "failed"
             return False  # Stopping iteration
         self._contents, self._skipped_contents, self._directories, rev = data
         self._revisions = [rev]
@@ -602,8 +628,9 @@
         self._revisions = []
 
     def load_status(self) -> Dict[str, Any]:
-        assert self.snapshot is not None
-        if self.last_snapshot == self.snapshot:
+        if self.snapshot is None:
+            load_status = "failed"
+        elif self.last_snapshot == self.snapshot:
             load_status = "uneventful"
         else:
             load_status = "eventful"
diff --git a/swh/loader/cvs/tests/data/weird-paths.rlog b/swh/loader/cvs/tests/data/weird-paths.rlog
new file mode 100644
--- /dev/null
+++ b/swh/loader/cvs/tests/data/weird-paths.rlog
@@ -0,0 +1,122 @@
+RCS file: {cvsroot_path}/../greek-tree/alpha,v
+head: 1.2
+branch:
+locks: strict
+access list:
+symbolic names:
+	start: 1.1.1.1
+	yoyo: 1.1.1
+keyword substitution: kv
+total revisions: 3; selected revisions: 3
+description:
+----------------------------
+revision 1.2
+date: 2021-04-20 15:30:37 +0200; author: stsp; state: Exp; lines: +1 -0; commitid: 100607ED77A971503F5;
+edit alpha
+----------------------------
+revision 1.1
+date: 2021-04-20 15:29:48 +0200; author: stsp; state: Exp; commitid: 100607ED74996F4C8AF;
+branches: 1.1.1;
+Initial revision
+----------------------------
+revision 1.1.1.1
+date: 2021-04-20 15:29:48 +0200; author: stsp; state: Exp; lines: +0 -0; commitid: 100607ED74996F4C8AF;
+initial import
+=============================================================================
+
+RCS file: {cvsroot_path}/greek-tree/Attic/../beta,v
+head: 1.2
+branch:
+locks: strict
+access list:
+symbolic names:
+	start: 1.1.1.1
+	yoyo: 1.1.1
+keyword substitution: kv
+total revisions: 3; selected revisions: 3
+description:
+----------------------------
+revision 1.2
+date: 2021-04-20 15:30:52 +0200; author: stsp; state: dead; lines: +0 -0; commitid: 100607ED78A9726BA11;
+remove beta
+----------------------------
+revision 1.1
+date: 2021-04-20 15:29:48 +0200; author: stsp; state: Exp; commitid: 100607ED74996F4C8AF;
+branches: 1.1.1;
+Initial revision
+----------------------------
+revision 1.1.1.1
+date: 2021-04-20 15:29:48 +0200; author: stsp; state: Exp; lines: +0 -0; commitid: 100607ED74996F4C8AF;
+initial import
+=============================================================================
+
+RCS file: /etc/passwd
+head: 1.2
+branch:
+locks: strict
+access list:
+symbolic names:
+keyword substitution: kv
+total revisions: 2; selected revisions: 2
+description:
+----------------------------
+revision 1.2
+date: 2021-04-20 15:32:18 +0200; author: stsp; state: Exp; lines: +1 -0; commitid: 100607ED7DF9763EBB7;
+edit psi
+----------------------------
+revision 1.1
+date: 2021-04-20 15:31:15 +0200; author: stsp; state: Exp; commitid: 100607ED7999735979A;
+add epsilon/psi
+=============================================================================
+
+RCS file: {cvsroot_path}/../../etc/passwd
+head: 1.3
+branch:
+locks: strict
+access list:
+symbolic names:
+	start: 1.1.1.1
+	yoyo: 1.1.1
+keyword substitution: kv
+total revisions: 4; selected revisions: 4
+description:
+----------------------------
+revision 1.3
+date: 2021-04-20 15:32:45 +0200; author: stsp; state: Exp; lines: +1 -1; commitid: 100607ED7F29770C997;
+reviving zeta
+----------------------------
+revision 1.2
+date: 2021-04-20 15:31:57 +0200; author: stsp; state: dead; lines: +0 -0; commitid: 100607ED7C89753114E;
+remove epsilon/zeta
+----------------------------
+revision 1.1
+date: 2021-04-20 15:29:48 +0200; author: stsp; state: Exp; commitid: 100607ED74996F4C8AF;
+branches: 1.1.1;
+Initial revision
+----------------------------
+revision 1.1.1.1
+date: 2021-04-20 15:29:48 +0200; author: stsp; state: Exp; lines: +0 -0; commitid: 100607ED74996F4C8AF;
+initial import
+=============================================================================
+
+RCS file: {cvsroot_path}/greek-tree/gamma/../../../../../../etc/passwd
+head: 1.1
+branch: 1.1.1
+locks: strict
+access list:
+symbolic names:
+	start: 1.1.1.1
+	yoyo: 1.1.1
+keyword substitution: kv
+total revisions: 2; selected revisions: 2
+description:
+----------------------------
+revision 1.1
+date: 2021-04-20 15:29:48 +0200; author: stsp; state: Exp; commitid: 100607ED74996F4C8AF;
+branches: 1.1.1;
+Initial revision
+----------------------------
+revision 1.1.1.1
+date: 2021-04-20 15:29:48 +0200; author: stsp; state: Exp; lines: +0 -0; commitid: 100607ED74996F4C8AF;
+initial import
+=============================================================================
diff --git a/swh/loader/cvs/tests/test_loader.py b/swh/loader/cvs/tests/test_loader.py
--- a/swh/loader/cvs/tests/test_loader.py
+++ b/swh/loader/cvs/tests/test_loader.py
@@ -4,6 +4,7 @@
 # See top-level LICENSE file for more information
 
 import os
+import tempfile
 from typing import Any, Dict
 
 from swh.loader.cvs.loader import CvsLoader
@@ -1081,3 +1082,43 @@
         "skipped_content": 0,
         "snapshot": 1,
     }
+
+
+def test_loader_cvs_weird_paths_in_rlog(swh_storage, datadir, tmp_path):
+    """Handle cvs rlog output which contains weird paths"""
+    archive_name = "greek-repository"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+    repo_url += "/greek-tree"  # CVS module name
+
+    # Ask our cvsclient to connect via the 'cvs server' command
+    repo_url = f"fake://{repo_url[7:]}"
+
+    # And let's pretend the server returned this rlog output instead of
+    # what it would actually return.
+    cvsroot_path = os.path.dirname(repo_url[7:])
+    with open(os.path.join(datadir, "weird-paths.rlog")) as rlog_weird_paths:
+        with tempfile.NamedTemporaryFile(
+            dir=tmp_path, mode="w+", delete=False, prefix="weird-path-rlog-"
+        ) as rlog_file_override:
+            rlog_file_override_path = rlog_file_override.name
+            for line in rlog_weird_paths:
+                rlog_file_override.write(line.replace("{cvsroot_path}", cvsroot_path))
+
+    try:
+        # BadPathException is not expected from the constructor; load() must
+        # report the weird paths as a failed visit instead.
+        loader = CvsLoader(
+            swh_storage,
+            repo_url,
+            cvsroot_path=os.path.join(tmp_path, archive_name),
+            rlog_file_override=rlog_file_override_path,
+        )
+
+        assert loader.load() == {"status": "failed"}
+
+        assert_last_visit_matches(
+            swh_storage, repo_url, status="failed", type="cvs",
+        )
+    finally:
+        os.unlink(rlog_file_override_path)