diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py --- a/swh/loader/svn/loader.py +++ b/swh/loader/svn/loader.py @@ -10,6 +10,7 @@ from datetime import datetime from mmap import ACCESS_WRITE, mmap import os +import pathlib import pty import re import shutil @@ -101,7 +102,7 @@ self.debug = debug self.temp_directory = temp_directory self.done = False - self.svnrepo = None + self.svnrepo: Optional[SvnRepo] = None self.skip_post_load = False # Revision check is configurable self.check_revision = check_revision @@ -112,15 +113,15 @@ self._revisions: List[Revision] = [] self._snapshot: Optional[Snapshot] = None # internal state, current visit - self._last_revision = None + self._last_revision: Optional[Revision] = None self._visit_status = "full" self._load_status = "uneventful" self.visit_date = visit_date self.incremental = incremental self.snapshot: Optional[Snapshot] = None # state from previous visit - self.latest_snapshot = None - self.latest_revision = None + self.latest_snapshot: Optional[Snapshot] = None + self.latest_revision: Optional[Revision] = None def pre_cleanup(self): """Cleanup potential dangling files from prior runs (e.g. 
OOM killed @@ -160,7 +161,7 @@ """ assert self.svnrepo is not None local_dirname, local_url = self.svnrepo.export_temporary(revision) - h = from_disk.Directory.from_disk(path=local_url).hash + h = from_disk.Directory.from_disk(path=bytes(local_url)).hash self.svnrepo.clean_fs(local_dirname) return h @@ -391,7 +392,7 @@ def prepare_origin_visit(self): self.origin = Origin(url=self.origin_url if self.origin_url else self.svn_url) - def prepare(self): + def prepare(self) -> None: latest_snapshot_revision = self._latest_snapshot_revision(self.origin_url) if latest_snapshot_revision: self.latest_snapshot, self.latest_revision = latest_snapshot_revision @@ -402,7 +403,7 @@ try: self.svnrepo = SvnRepo( - self.svn_url, self.origin_url, local_dirname, self.max_content_size + self.svn_url, self.origin_url, local_dirname, self.max_content_size or 0 ) except SubversionException as e: error_msgs = [ @@ -458,6 +459,6 @@ self.done = True # Stopping iteration self._visit_status = "full" except Exception as e: # svn:external, hash divergence, i/o error... 
self.log.exception(e) self.done = True # Stopping iteration self._visit_status = "partial" @@ -546,11 +548,13 @@ self._last_revision.directory, ) - def _create_tmp_dir(self, root_tmp_dir: str) -> str: - return tempfile.mkdtemp( - dir=root_tmp_dir, - prefix=TEMPORARY_DIR_PREFIX_PATTERN, - suffix="-%s" % os.getpid(), + def _create_tmp_dir(self, root_tmp_dir: str) -> pathlib.Path: + return pathlib.Path( + tempfile.mkdtemp( + dir=root_tmp_dir, + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + suffix="-%s" % os.getpid(), + ) ) @@ -564,7 +568,7 @@ self, storage: StorageInterface, url: str, - archive_path: str, + archive_path: pathlib.Path, origin_url: Optional[str] = None, incremental: bool = False, visit_date: Optional[datetime] = None, @@ -606,7 +610,7 @@ self.log.debug( "Clean up temporary directory dump %s for project %s", self.temp_dir, - os.path.basename(self.repo_path), + self.repo_path.name, ) shutil.rmtree(self.temp_dir) @@ -663,7 +667,9 @@ pass return svn_revision - def dump_svn_revisions(self, svn_url: str, last_loaded_svn_rev: int = -1) -> str: + def dump_svn_revisions( + self, svn_url: str, last_loaded_svn_rev: int = -1 + ) -> pathlib.Path: """Generate a subversion dump file using the svnrdump tool. If the svnrdump command failed somehow, the produced dump file is analyzed to determine if a partial loading is still feasible. 
@@ -682,10 +688,10 @@ # successfully dumped revision numbers are printed to it dump_temp_dir = tempfile.mkdtemp(dir=self.temp_dir) dump_name = "".join(c for c in svn_url if c.isalnum()) - dump_path = "%s/%s.svndump" % (dump_temp_dir, dump_name) + dump_path = pathlib.Path("%s/%s.svndump" % (dump_temp_dir, dump_name)) stderr_lines = [] self.log.debug("Executing %s", " ".join(svnrdump_cmd)) - with open(dump_path, "wb") as dump_file: + with dump_path.open("wb") as dump_file: stderr_r, stderr_w = pty.openpty() svnrdump = Popen(svnrdump_cmd, stdout=dump_file, stderr=stderr_w) os.close(stderr_w) @@ -745,7 +751,7 @@ last_dumped_rev, ) - with open(dump_path, "r+b") as f: + with dump_path.open("r+b") as f: with mmap(f.fileno(), 0, access=ACCESS_WRITE) as s: pattern = ( "Revision-number: %s" % (last_dumped_rev + 1) @@ -812,12 +818,12 @@ suffix="-%s" % os.getpid(), root_dir=self.temp_dir, ) - self.svn_url = "file://%s" % self.repo_path + self.svn_url = self.repo_path.as_uri() super().prepare() def cleanup(self): super().cleanup() - if self.temp_dir and os.path.exists(self.temp_dir): + if self.temp_dir and self.temp_dir.exists(): shutil.rmtree(self.temp_dir) def visit_status(self): diff --git a/swh/loader/svn/replay.py b/swh/loader/svn/replay.py --- a/swh/loader/svn/replay.py +++ b/swh/loader/svn/replay.py @@ -16,6 +16,7 @@ from itertools import chain import logging import os +import pathlib import shutil import tempfile from typing import ( @@ -124,7 +125,7 @@ return filetype, target -def is_file_an_svnlink_p(fullpath: bytes) -> Tuple[bool, bytes]: +def is_file_an_svnlink_p(fullpath: pathlib.Path) -> Tuple[bool, bytes]: """Determine if a filepath is an svnlink or something else. Args: @@ -135,7 +136,7 @@ (as per svn) and the link target. 
""" - if os.path.islink(fullpath): + if fullpath.is_symlink(): return False, b"" with open(fullpath, "rb") as f: filetype, src = read_svn_link(f.read()) @@ -199,14 +200,14 @@ def __init__( self, directory: from_disk.Directory, - rootpath: bytes, - path: bytes, + rootpath: pathlib.Path, + path: pathlib.Path, state: FileState, svnrepo: SvnRepo, ): self.directory = directory self.path = path - self.fullpath = os.path.join(rootpath, path) + self.fullpath = rootpath.joinpath(path) self.state = state self.svnrepo = svnrepo self.editor = svnrepo.swhreplay.editor @@ -227,7 +228,7 @@ # backup end of line style for file self.state.eol_style = value - def __make_symlink(self, src: bytes) -> None: + def __make_symlink(self, src: pathlib.Path) -> None: """Convert the svnlink to a symlink on disk. This function expects self.fullpath to be a svn link. @@ -243,7 +244,7 @@ """ os.remove(self.fullpath) - os.symlink(src=src, dst=self.fullpath) + src.symlink_to(self.fullpath) def __make_svnlink(self) -> bytes: """Convert the symlink to a svnlink on disk. @@ -254,9 +255,9 @@ """ # we replace the symlink by a svnlink # to be able to patch the file on future commits - src = os.readlink(self.fullpath) + src = self.fullpath.readlink() os.remove(self.fullpath) - sbuf = b"link " + src + sbuf = b"link " + bytes(src) with open(self.fullpath, "wb") as f: f.write(sbuf) return sbuf @@ -267,7 +268,7 @@ return lambda *args: None if os.path.lexists(self.fullpath): - if os.path.islink(self.fullpath): + if self.fullpath.is_symlink(): # svn does not deal with symlink so we transform into # real svn symlink for potential patching in later # commits @@ -299,8 +300,8 @@ # since patch has been applied is_link, src = is_file_an_svnlink_p(self.fullpath) if is_link: - self.__make_symlink(src) - elif not os.path.isdir(self.fullpath): # not a real link ... + self.__make_symlink(pathlib.Path(os.fsdecode(src))) + elif not self.fullpath.is_dir(): # not a real link ... 
# when a file with the svn:special property set is not a svn link, # the svn export operation might extract a truncated version of it # if it is a binary file, so ensure to produce the same file as the @@ -308,7 +309,7 @@ with open(self.fullpath, "rb") as f: content = f.read() self.svnrepo.client.export( - os.path.join(self.svnrepo.remote_url.encode(), self.path), + os.path.join(self.svnrepo.remote_url, self.path), to=self.fullpath, peg_rev=self.editor.revnum, ignore_keywords=True, @@ -320,7 +321,7 @@ # keep track of original file content in order to restore # it if the svn:special property gets unset in another revision self.state.svn_special_path_non_link_data = content - elif os.path.islink(self.fullpath): + elif self.fullpath.is_symlink(): # path was a symbolic link in previous revision but got the property # svn:special unset in current one, revert its content to svn link format self.__make_svnlink() @@ -331,7 +332,7 @@ f.write(self.state.svn_special_path_non_link_data) self.state.svn_special_path_non_link_data = None - is_link = os.path.islink(self.fullpath) + is_link = self.fullpath.is_symlink() if not is_link: # if a link, do nothing regarding flag if self.state.executable == EXEC_FLAG: os.chmod(self.fullpath, 0o755) @@ -347,11 +348,13 @@ data = f.read() data = _normalize_line_endings(data, self.state.eol_style) mode = os.lstat(self.fullpath).st_mode - self.directory[self.path] = from_disk.Content.from_bytes( + self.directory[bytes(self.path)] = from_disk.Content.from_bytes( mode=mode, data=data ) else: - self.directory[self.path] = from_disk.Content.from_file(path=self.fullpath) + self.directory[bytes(self.path)] = from_disk.Content.from_file( + path=self.fullpath + ) @dataclass @@ -359,7 +362,9 @@ """Persists some directory states (eg. 
externals) across revisions while replaying them.""" - externals: Dict[str, List[Tuple[str, Optional[int]]]] = field(default_factory=dict) + externals: Dict[pathlib.Path, List[Tuple[str, Optional[int]]]] = field( + default_factory=dict + ) """Map a path in the directory to a list of (external_url, revision) targeting it""" @@ -384,28 +389,28 @@ def __init__( self, directory: from_disk.Directory, - rootpath: bytes, - path: bytes, - file_states: Dict[bytes, FileState], - dir_states: Dict[bytes, DirState], + rootpath: pathlib.Path, + path: Optional[pathlib.Path], + file_states: Dict[pathlib.Path, FileState], + dir_states: Dict[pathlib.Path, DirState], svnrepo: SvnRepo, ): self.directory = directory self.rootpath = rootpath - self.path = path + self.path = path or pathlib.Path("") # build directory on init os.makedirs(rootpath, exist_ok=True) self.file_states = file_states self.dir_states = dir_states self.svnrepo = svnrepo self.editor = svnrepo.swhreplay.editor - self.externals: Dict[str, List[Tuple[str, Optional[int], bool]]] = {} + self.externals: Dict[pathlib.Path, List[Tuple[str, Optional[int], bool]]] = {} # repository root dir has empty path if path: self.editor.modified_paths.add(path) - def remove_child(self, path: bytes) -> None: + def remove_child(self, path: pathlib.Path) -> None: """Remove a path from the current objects. The path can be resolved as link, file or directory. 
@@ -418,12 +423,12 @@ """ try: - entry_removed = self.directory[path] + entry_removed = self.directory[bytes(path)] except KeyError: entry_removed = None else: - del self.directory[path] - fpath = os.path.join(self.rootpath, path) + del self.directory[bytes(path)] + fpath = self.rootpath.joinpath(path) if isinstance(entry_removed, from_disk.Directory): shutil.rmtree(fpath) else: @@ -432,73 +437,74 @@ # when deleting a directory ensure to remove any svn property for the # file it contains as they can be added again later in another revision # without the same property set - fullpath = os.path.join(self.rootpath, path) + fullpath = self.rootpath.joinpath(path) for state_path in list(self.file_states): - if state_path.startswith(fullpath + b"/"): + if state_path.is_relative_to(fullpath): del self.file_states[state_path] self.editor.modified_paths.discard(path) - def open_directory(self, path: str, *args) -> DirEditor: + def open_directory(self, path_str: str, *args) -> DirEditor: """Updating existing directory. """ + path = pathlib.Path(path_str) return DirEditor( self.directory, rootpath=self.rootpath, - path=os.fsencode(path), + path=path, file_states=self.file_states, dir_states=self.dir_states, svnrepo=self.svnrepo, ) - def add_directory(self, path: str, *args) -> DirEditor: + def add_directory(self, path_str: os.PathLike, *args) -> DirEditor: """Adding a new directory. 
""" - path_bytes = os.fsencode(path) - - os.makedirs(os.path.join(self.rootpath, path_bytes), exist_ok=True) + path = pathlib.Path(path_str) + self.rootpath.joinpath(path).mkdir(exist_ok=True, parents=True) + path_bytes = bytes(path) if path_bytes and path_bytes not in self.directory: - self.dir_states[path_bytes] = DirState() + self.dir_states[path] = DirState() self.directory[path_bytes] = from_disk.Directory() return DirEditor( self.directory, self.rootpath, - path_bytes, + path, self.file_states, self.dir_states, svnrepo=self.svnrepo, ) - def open_file(self, path: str, *args) -> FileEditor: + def open_file(self, path_str: str, *args) -> FileEditor: """Updating existing file. """ - path_bytes = os.fsencode(path) - self.directory[path_bytes] = from_disk.Content() - fullpath = os.path.join(self.rootpath, path_bytes) + path = pathlib.Path(path_str) + self.directory[bytes(path)] = from_disk.Content() + fullpath = self.rootpath.joinpath(path) return FileEditor( self.directory, rootpath=self.rootpath, - path=path_bytes, + path=path, state=self.file_states[fullpath], svnrepo=self.svnrepo, ) - def add_file(self, path: str, *args) -> FileEditor: + def add_file(self, path_str: str, *args) -> FileEditor: """Creating a new file. 
""" - path_bytes = os.fsencode(path) - self.directory[path_bytes] = from_disk.Content() - fullpath = os.path.join(self.rootpath, path_bytes) + path = pathlib.Path(path_str) + self.directory[bytes(path)] = from_disk.Content() + fullpath = self.rootpath.joinpath(path) self.file_states[fullpath] = FileState() return FileEditor( self.directory, self.rootpath, - path_bytes, + path, state=self.file_states[fullpath], svnrepo=self.svnrepo, ) @@ -529,7 +535,7 @@ revision, relative_url, ) = parse_external_definition( - external, os.fsdecode(self.path), self.svnrepo.origin_url + external, self.path, self.svnrepo.origin_url ) self.externals[path].append((external_url, revision, relative_url)) @@ -538,19 +544,19 @@ # remove associated paths from the reconstructed filesystem externals = self.dir_states[self.path].externals for path in externals.keys(): - self.remove_external_path(os.fsencode(path)) + self.remove_external_path(path) self.dir_states[self.path].externals = {} - def delete_entry(self, path: str, revision: int) -> None: + def delete_entry(self, path_str: str, revision: int) -> None: """Remove a path. """ - path_bytes = os.fsencode(path) - if path_bytes not in self.editor.external_paths: - fullpath = os.path.join(self.rootpath, path_bytes) + path = pathlib.Path(path_str) + if path not in self.editor.external_paths: + fullpath = self.rootpath.joinpath(path) self.file_states.pop(fullpath, None) - self.remove_child(path_bytes) + self.remove_child(path) def close(self): """Function called when we finish processing a repository. 
@@ -577,7 +583,7 @@ } old_externals = prev_externals_set - externals_set for path, _, _ in old_externals: - self.remove_external_path(os.fsencode(path)) + self.remove_external_path(path) else: # some external paths might have been removed in the current replayed # revision by a delete operation on an overlapping versioned path so we @@ -606,7 +612,7 @@ self.svnrepo.has_recursive_externals = any( is_recursive_external( - self.svnrepo.origin_url, os.fsdecode(path), external_path, external_url + self.svnrepo.origin_url, path, external_path, external_url ) for path, dir_state in self.dir_states.items() for external_path in dir_state.externals.keys() @@ -622,15 +628,15 @@ def process_external( self, - path: str, + path_str: str, external_url: str, revision: Optional[int], relative_url: bool, remove_target_path: bool = True, ) -> None: + path = pathlib.Path(path_str) external = (external_url, revision) - dest_path = os.fsencode(path) - dest_fullpath = os.path.join(self.path, dest_path) + dest_fullpath = self.path.joinpath(path) prev_externals = self.dir_states[self.path].externals if ( path in prev_externals @@ -641,7 +647,7 @@ return if is_recursive_external( - self.svnrepo.origin_url, os.fsdecode(self.path), path, external_url + self.svnrepo.origin_url, self.path, path, external_url ): # recursive external, skip it return @@ -652,16 +658,16 @@ # try to export external in a temporary path, destination path could # be versioned and must be overridden only if the external URL is # still valid - temp_dir = os.fsencode( + temp_dir = pathlib.Path( tempfile.mkdtemp(dir=self.editor.externals_cache_dir) ) - temp_path = os.path.join(temp_dir, dest_path) - os.makedirs(b"/".join(temp_path.split(b"/")[:-1]), exist_ok=True) + temp_path = temp_dir.joinpath(path) + temp_path.parent.mkdir(exist_ok=True, parents=True) if external_url not in self.editor.dead_externals: logger.debug("Exporting external %s to path %s", external_url, path) self.svnrepo.client.export( 
external_url.rstrip("/"), - to=temp_path, + to=str(temp_path), peg_rev=revision, ignore_keywords=True, ) @@ -677,19 +683,18 @@ # subversion export will always create the subdirectories of the external # path regardless the validity of the remote URL - dest_path_split = dest_path.split(b"/") current_path = self.path - self.add_directory(os.fsdecode(current_path)) - for subpath in dest_path_split[:-1]: - current_path = os.path.join(current_path, subpath) - self.add_directory(os.fsdecode(current_path)) + self.add_directory(current_path) + for subpath in path.parts[:-1]: + current_path = current_path.joinpath(subpath) + self.add_directory(current_path) - if os.path.exists(temp_path): + if temp_path.exists(): # external successfully exported if remove_target_path: # remove previous path in from_disk model - self.remove_external_path(dest_path, remove_subpaths=False) + self.remove_external_path(path, remove_subpaths=False) # mark external as valid self.editor.valid_externals[dest_fullpath] = ( @@ -698,48 +703,48 @@ ) # copy exported path to reconstructed filesystem - fullpath = os.path.join(self.rootpath, dest_fullpath) + fullpath = self.rootpath.joinpath(dest_fullpath) # update from_disk model and store external paths self.editor.external_paths.add(dest_fullpath) self.editor.modified_paths.add(dest_fullpath) - if os.path.isfile(temp_path): - if os.path.islink(fullpath): + if temp_path.is_file(): + if fullpath.is_symlink(): # remove destination file if it is a link os.remove(fullpath) - shutil.copy(os.fsdecode(temp_path), os.fsdecode(fullpath)) - self.directory[dest_fullpath] = from_disk.Content.from_file( - path=fullpath + shutil.copy(temp_path, fullpath) + self.directory[bytes(dest_fullpath)] = from_disk.Content.from_file( + path=bytes(fullpath) ) else: - self.add_directory(os.fsdecode(dest_fullpath)) + self.add_directory(dest_fullpath) # copy_tree needs sub-directories to exist in destination for root, dirs, files in os.walk(temp_path): for dir in dirs: - subdir = 
os.path.join(root, dir).replace(temp_path + b"/", b"") - self.add_directory( - os.fsdecode(os.path.join(dest_fullpath, subdir)) + subdir = ( + pathlib.Path(root) + .joinpath(pathlib.Path(dir)) + .relative_to(temp_path) ) + self.add_directory(dest_fullpath.joinpath(subdir)) copy_tree( - os.fsdecode(temp_path), - os.fsdecode(fullpath), - preserve_symlinks=True, + str(temp_path), str(fullpath), preserve_symlinks=True, ) # TODO: replace code above by the line below once we use Python >= 3.8 in production # noqa # shutil.copytree(temp_path, fullpath, symlinks=True, dirs_exist_ok=True) # noqa - self.directory[dest_fullpath] = from_disk.Directory.from_disk( - path=fullpath + self.directory[bytes(dest_fullpath)] = from_disk.Directory.from_disk( + path=bytes(fullpath) ) external_paths = set() for root, dirs, files in os.walk(fullpath): external_paths.update( [ - os.path.join(root.replace(self.rootpath + b"/", b""), p) + pathlib.Path(root).relative_to(self.rootpath).joinpath(p) for p in chain(dirs, files) ] ) @@ -747,28 +752,30 @@ self.editor.modified_paths.update(external_paths) # ensure hash update for the directory with externals set - self.directory[self.path].update_hash(force=True) + self.directory[bytes(self.path)].update_hash(force=True) - def remove_external_path(self, external_path: bytes, remove_subpaths=True) -> None: + def remove_external_path( + self, external_path: pathlib.Path, remove_subpaths=True + ) -> None: """Remove a previously exported SVN external path from the reconstruted filesystem. 
""" - fullpath = os.path.join(self.path, external_path) + fullpath = self.path.joinpath(external_path) self.remove_child(fullpath) self.editor.external_paths.discard(fullpath) self.editor.valid_externals.pop(fullpath, None) for path in list(self.editor.external_paths): - if path.startswith(fullpath + b"/"): + if path.is_relative_to(fullpath): self.editor.external_paths.remove(path) if remove_subpaths: - subpath_split = external_path.split(b"/")[:-1] - for i in reversed(range(1, len(subpath_split) + 1)): + subpath_parts = external_path.parts[:-1] + for i in reversed(range(1, len(subpath_parts) + 1)): # delete external sub-directory only if it is not versioned - subpath = os.path.join(self.path, b"/".join(subpath_split[0:i])) + subpath = self.path.joinpath(*subpath_parts[0:i]) try: self.svnrepo.client.info( - svn_urljoin(self.svnrepo.remote_url, os.fsdecode(subpath)), + svn_urljoin(self.svnrepo.remote_url, str(subpath)), peg_revision=self.editor.revnum, revision=self.editor.revnum, ) @@ -780,17 +787,21 @@ try: # externals can overlap with versioned files so we must restore # them after removing the path above - dest_path = os.path.join(self.rootpath, fullpath) + dest_path = self.rootpath.joinpath(fullpath) self.svnrepo.client.export( - svn_urljoin(self.svnrepo.remote_url, os.fsdecode(fullpath)), - to=dest_path, + svn_urljoin(self.svnrepo.remote_url, str(fullpath)), + to=str(dest_path), peg_rev=self.editor.revnum, ignore_keywords=True, ) - if os.path.isfile(dest_path) or os.path.islink(dest_path): - self.directory[fullpath] = from_disk.Content.from_file(path=dest_path) + if dest_path.is_file() or dest_path.is_symlink(): + self.directory[bytes(fullpath)] = from_disk.Content.from_file( + path=bytes(dest_path) + ) else: - self.directory[fullpath] = from_disk.Directory.from_disk(path=dest_path) + self.directory[bytes(fullpath)] = from_disk.Directory.from_disk( + path=bytes(dest_path) + ) except SubversionException: pass @@ -806,24 +817,27 @@ def __init__( self, - 
rootpath: bytes, + rootpath: pathlib.Path, directory: from_disk.Directory, svnrepo: SvnRepo, - temp_dir: str, + temp_dir: pathlib.Path, ): self.rootpath = rootpath self.directory = directory - self.file_states: Dict[bytes, FileState] = defaultdict(FileState) - self.dir_states: Dict[bytes, DirState] = defaultdict(DirState) - self.external_paths: Set[bytes] = set() - self.valid_externals: Dict[bytes, Tuple[str, bool]] = {} + self.file_states: Dict[pathlib.Path, FileState] = defaultdict(FileState) + self.dir_states: Dict[pathlib.Path, DirState] = defaultdict(DirState) + self.external_paths: Set[pathlib.Path] = set() + self.valid_externals: Dict[pathlib.Path, Tuple[str, bool]] = {} self.dead_externals: Set[str] = set() self.externals_cache_dir = tempfile.mkdtemp(dir=temp_dir) - self.externals_cache: Dict[Tuple[str, Optional[int]], bytes] = {} + + # {(external_url, revision): path} + self.externals_cache: Dict[Tuple[str, Optional[int]], pathlib.Path] = {} + self.svnrepo = svnrepo self.revnum = None # to store the set of paths added or modified when replaying a revision - self.modified_paths: Set[bytes] = set() + self.modified_paths: Set[pathlib.Path] = set() def set_target_revision(self, revnum) -> None: self.revnum = revnum @@ -840,7 +854,7 @@ return DirEditor( self.directory, rootpath=self.rootpath, - path=b"", + path=None, file_states=self.file_states, dir_states=self.dir_states, svnrepo=self.svnrepo, @@ -854,9 +868,9 @@ def __init__( self, conn: RemoteAccess, - rootpath: bytes, + rootpath: pathlib.Path, svnrepo: SvnRepo, - temp_dir: str, + temp_dir: pathlib.Path, directory: Optional[from_disk.Directory] = None, ): self.conn = conn @@ -905,7 +919,7 @@ directories.append(self.editor.directory.to_model()) for path in self.editor.modified_paths: - obj = self.directory[path].to_model() + obj = self.directory[bytes(path)].to_model() obj_type = obj.object_type if obj_type in (Content.object_type, DiskBackedContent.object_type): contents.append(obj.with_data()) @@ -954,11 
+968,9 @@ os.makedirs(local_url, exist_ok=True) - rootpath = tempfile.mkdtemp( - prefix=local_url, suffix="-" + os.path.basename(svn_url) - ) + rootpath = tempfile.mkdtemp(prefix=local_url, suffix="-" + svn_url.name) - rootpath = os.fsencode(rootpath) + rootpath = pathlib.Path(rootpath) # Do not go beyond the repository's latest revision revision_end_max = conn.get_latest_revnum() @@ -986,7 +998,7 @@ - print("%s" % rootpath.decode("utf-8")) + print(rootpath) finally: if cleanup: - if os.path.exists(rootpath): + if rootpath.exists(): shutil.rmtree(rootpath) diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py --- a/swh/loader/svn/svn.py +++ b/swh/loader/svn/svn.py @@ -11,6 +11,7 @@ import logging import os +import pathlib import shutil import tempfile from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -51,7 +52,7 @@ self, remote_url: str, origin_url: str, - local_dirname: str, + local_dirname: pathlib.Path, max_content_length: int, ): self.remote_url = remote_url.rstrip("/") @@ -67,7 +68,7 @@ self.local_dirname = local_dirname local_name = os.path.basename(self.remote_url) - self.local_url = os.path.join(self.local_dirname, local_name).encode("utf-8") + self.local_url = self.local_dirname.joinpath(local_name) self.uuid = self.conn.get_uuid().encode("utf-8") self.swhreplay = replay.Replay( @@ -195,7 +196,7 @@ ): yield self.__to_entry(log_entry) - def export_temporary(self, revision: int) -> Tuple[str, bytes]: + def export_temporary(self, revision: int) -> Tuple[pathlib.Path, pathlib.Path]: """Export the repository to a given revision in a temporary location. This is up to the caller of this function to clean up the temporary location when done (cf. self.clean_fs method) @@ -208,12 +209,14 @@ folder, local_url where the repository was exported. """ - local_dirname = tempfile.mkdtemp( - dir=self.local_dirname, prefix=f"check-revision-{revision}." + local_dirname = pathlib.Path( + tempfile.mkdtemp( + dir=self.local_dirname, prefix=f"check-revision-{revision}." 
+ ) ) local_name = os.path.basename(self.remote_url) - local_url = os.path.join(local_dirname, local_name) + local_url = local_dirname.joinpath(local_name) url = self.remote_url # if some paths have external URLs relative to the repository URL but targeting @@ -278,7 +281,7 @@ ) self.client.export( url, - to=local_url, + to=str(local_url), rev=revision, ignore_keywords=True, ignore_externals=self.has_recursive_externals, @@ -293,7 +296,7 @@ pass else: raise - return local_dirname, os.fsencode(local_url) + return local_dirname, local_url def swh_hash_data_per_revision( self, start_revision: int, end_revision: int @@ -358,7 +361,7 @@ local_dirname, local_url = self.export_temporary(revision) # Compute the current hashes on disk directory = DirectoryFromDisk.from_disk( - path=local_url, max_content_length=self.max_content_length + path=bytes(local_url), max_content_length=self.max_content_length ) # Retrieve the commit information for revision @@ -369,7 +372,7 @@ return commit, directory - def clean_fs(self, local_dirname: Optional[str] = None) -> None: + def clean_fs(self, local_dirname: Optional[pathlib.Path] = None) -> None: """Clean up the local working copy. 
Args: @@ -378,6 +381,6 @@ """ dirname = local_dirname or self.local_dirname - if os.path.exists(dirname): + if dirname.exists(): logger.debug("cleanup %s", dirname) shutil.rmtree(dirname) diff --git a/swh/loader/svn/tests/test_utils.py b/swh/loader/svn/tests/test_utils.py --- a/swh/loader/svn/tests/test_utils.py +++ b/swh/loader/svn/tests/test_utils.py @@ -390,6 +390,7 @@ ], ) def test_parse_external_definition(external, dir_path, repo_url, expected_result): - assert ( - utils.parse_external_definition(external, dir_path, repo_url) == expected_result + (path, external_url, revision, relative_url) = utils.parse_external_definition( + external, dir_path, repo_url ) + assert (str(path), external_url, revision, relative_url) == expected_result diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py --- a/swh/loader/svn/utils.py +++ b/swh/loader/svn/utils.py @@ -6,6 +6,7 @@ import errno import logging import os +import pathlib import re import shutil from subprocess import PIPE, Popen, call @@ -59,13 +60,13 @@ def init_svn_repo_from_dump( - dump_path: str, + dump_path: pathlib.Path, prefix: Optional[str] = None, suffix: Optional[str] = None, - root_dir: str = "/tmp", + root_dir: pathlib.Path = pathlib.Path("/tmp"), gzip: bool = False, cleanup_dump: bool = True, -) -> Tuple[str, str]: +) -> Tuple[pathlib.Path, pathlib.Path]: """Given a path to a svn dump, initialize an svn repository with the content of said dump. 
@@ -89,28 +90,30 @@ """ project_name = os.path.basename(os.path.dirname(dump_path)) - temp_dir = tempfile.mkdtemp(prefix=prefix, suffix=suffix, dir=root_dir) + temp_dir = pathlib.Path( + tempfile.mkdtemp(prefix=prefix, suffix=suffix, dir=root_dir) + ) try: - repo_path = os.path.join(temp_dir, project_name) + repo_path = temp_dir.joinpath(project_name) # create the repository that will be loaded with the dump - cmd = ["svnadmin", "create", repo_path] + cmd = ["svnadmin", "create", str(repo_path)] r = call(cmd) if r != 0: raise ValueError( "Failed to initialize empty svn repo for %s" % project_name ) - read_dump_cmd = ["cat", dump_path] + read_dump_cmd = ["cat", str(dump_path)] if gzip: - read_dump_cmd = ["gzip", "-dc", dump_path] + read_dump_cmd = ["gzip", "-dc", str(dump_path)] with Popen(read_dump_cmd, stdout=PIPE) as dump: # load dump and bypass properties validation as Unicode decoding errors # are already handled in loader implementation (see _ra_codecs_error_handler # in ra.py) - cmd = ["svnadmin", "load", "-q", "--bypass-prop-validation", repo_path] + cmd = ["svnadmin", "load", "-q", "--bypass-prop-validation", str(repo_path)] r = call(cmd, stdin=dump.stdout) if r != 0: raise ValueError( @@ -132,12 +135,12 @@ def init_svn_repo_from_archive_dump( - archive_path: str, + archive_path: pathlib.Path, prefix: Optional[str] = None, suffix: Optional[str] = None, - root_dir: str = "/tmp", + root_dir: pathlib.Path = pathlib.Path("/tmp"), cleanup_dump: bool = True, -) -> Tuple[str, str]: +) -> Tuple[pathlib.Path, pathlib.Path]: """Given a path to an archive containing an svn dump, initializes an svn repository with the content of the uncompressed dump. @@ -170,7 +173,7 @@ ) -def svn_urljoin(base_url: str, *args) -> str: +def svn_urljoin(base_url: str, *args: str) -> str: """Join a base URL and a list of paths in a SVN way. 
For instance: @@ -197,8 +200,8 @@ def parse_external_definition( - external: str, dir_path: str, repo_url: str -) -> Tuple[str, str, Optional[int], bool]: + external: str, dir_path: pathlib.Path, repo_url: str +) -> Tuple[pathlib.Path, str, Optional[int], bool]: """Parse a subversion external definition. Args: @@ -251,7 +254,7 @@ elif external_part.startswith("../"): # URL relative to the URL of the directory on which the svn:externals # property is set - external_url = svn_urljoin(repo_url, dir_path, external_part) + external_url = svn_urljoin(repo_url, str(dir_path), external_part) relative_url = not external_url.startswith(repo_url) elif re.match(r"^.*:*//.*", external_part): # absolute external URL @@ -291,11 +294,14 @@ except ValueError: # handle URL like http://user@svn.example.org/ pass - return (path.rstrip("/"), external_url, revision, relative_url) + return (pathlib.Path(path), external_url, revision, relative_url) def is_recursive_external( - origin_url: str, dir_path: str, external_path: str, external_url: str + origin_url: str, + dir_path: pathlib.Path, + external_path: pathlib.Path, + external_url: str, ) -> bool: """ Check if an external definition can lead to a recursive subversion export @@ -315,6 +321,6 @@ external_url = urlunparse( parsed_external_url._replace(scheme=parsed_origin_url.scheme) ) - return svn_urljoin(origin_url, quote(dir_path), quote(external_path)).startswith( - external_url - ) + return svn_urljoin( + origin_url, quote(str(dir_path)), quote(str(external_path)) + ).startswith(external_url)