diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py --- a/swh/loader/svn/loader.py +++ b/swh/loader/svn/loader.py @@ -15,7 +15,7 @@ import shutil from subprocess import Popen import tempfile -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Sequence, Tuple from subvertpy import SubversionException @@ -52,9 +52,7 @@ class SvnLoader(BaseLoader): - """Swh svn loader. - - The repository is either remote or local. The loader deals with + """SVN loader. The repository is either remote or local. The loader deals with update on an already previously loaded repository. """ @@ -67,20 +65,27 @@ url: str, origin_url: Optional[str] = None, visit_date: Optional[datetime] = None, - destination_path: Optional[str] = None, - swh_revision: Optional[str] = None, incremental: bool = True, temp_directory: str = "/tmp", debug: bool = False, check_revision: int = 0, max_content_size: Optional[int] = None, ): - """Load an svn repository. + """Load a svn repository (either remote or local). Args: - ... + url: The default origin url + origin_url: Optional original url override to use as origin reference in the + archive. If not provided, "url" is used as origin. + visit_date: Optional date to override the visit date incremental: If True, the default, starts from the last snapshot (if any). Otherwise, starts from the initial commit of the repository. + temp_directory: The temporary working directory to use for computations + debug: If true, run the loader in debug mode. At the end of the loading, the + temporary working directory is not cleaned up to ease inspection. + Defaults to false. + check_revision: The number of svn commits between checks for hash divergence + max_content_size: Default max content size allowed """ super().__init__( @@ -109,7 +114,6 @@ self._visit_status = "full" self._load_status = "uneventful" self.visit_date = visit_date - self.destination_path = destination_path self.incremental = incremental self.snapshot: Optional[Snapshot] = None # state from previous visit @@ -142,16 +146,17 @@ return self.svnrepo.clean_fs() - def swh_revision_hash_tree_at_svn_revision(self, revision): + def swh_revision_hash_tree_at_svn_revision(self, revision: int) -> bytes: """Compute and return the hash tree at a given svn revision. Args: - rev (int): the svn revision we want to check + rev: the svn revision we want to check Returns: The hash tree directory as bytes. """ + assert self.svnrepo is not None local_dirname, local_url = self.svnrepo.export_temporary(revision) h = from_disk.Directory.from_disk(path=local_url).hash self.svnrepo.clean_fs(local_dirname) @@ -191,7 +196,9 @@ return None return latest_snapshot, revision - def build_swh_revision(self, rev, commit, dir_id, parents): + def build_swh_revision( + self, rev: int, commit: Dict, dir_id: bytes, parents: Sequence[bytes] + ) -> Revision: """Build the swh revision dictionary. This adds: @@ -201,21 +208,22 @@ svn revision number. Args: - rev (int): the svn revision number - commit (dict): the commit data: revision id, date, author, and message - dir_id (bytes): the upper tree's hash identifier - parents ([bytes]): the parents' identifiers + rev: the svn revision number + commit: the commit data: revision id, date, author, and message + dir_id: the upper tree's hash identifier + parents: the parents' identifiers Returns: The swh revision corresponding to the svn revision. """ + assert self.svnrepo is not None return converters.build_swh_revision( rev, commit, self.svnrepo.uuid, dir_id, parents ) def check_history_not_altered( - self, svnrepo, revision_start: int, swh_rev: Revision + self, svnrepo: SvnRepo, revision_start: int, swh_rev: Revision ) -> bool: """Given a svn repository, check if the history was modified in between visits. @@ -302,40 +310,31 @@ return revision_start, revision_end, revision_parents - def _check_revision_divergence(self, count, rev, dir_id): + def _check_revision_divergence(self, rev: int, dir_id: bytes) -> None: """Check for hash revision computation divergence. - The Rationale behind this is that svn can trigger unknown - edge cases (mixed CRLF, svn properties, etc...). Those are - not always easy to spot. Adding a check will help in - spotting missing edge cases. + The Rationale behind this is that svn can trigger unknown edge cases (mixed + CRLF, svn properties, etc...). Those are not always easy to spot. Adding a + regular check will help spotting potential missing edge cases. Args: - count (int): The number of revisions done so far - rev (dict): The actual revision we are computing from - dir_id (bytes): The actual directory for the given revision - - Returns: - False if no hash divergence detected + rev: The actual revision we are computing from + dir_id: The actual directory for the given revision Raises ValueError if a hash divergence is detected """ - # hash computation check - if (self.check_revision != 0 and count % self.check_revision) == 0: - self.log.debug("Checking hash computations on revision %s...", rev) - checked_dir_id = self.swh_revision_hash_tree_at_svn_revision(rev) - if checked_dir_id != dir_id: - err = ( - "Hash tree computation divergence detected " - "(%s != %s), stopping!" - % ( - hashutil.hash_to_hex(dir_id), - hashutil.hash_to_hex(checked_dir_id), - ) - ) - raise ValueError(err) + + self.log.debug("Checking hash computations on revision %s...", rev) + checked_dir_id = self.swh_revision_hash_tree_at_svn_revision(rev) + if checked_dir_id != dir_id: + err = ( + "Hash tree computation divergence detected " + "(%s != %s), stopping!" + % (hashutil.hash_to_hex(dir_id), hashutil.hash_to_hex(checked_dir_id),) + ) + raise ValueError(err) def process_svn_revisions( self, svnrepo, revision_start, revision_end, revision_parents @@ -381,8 +380,12 @@ hashutil.hash_to_hex(dir_id), ) - if self.check_revision: - self._check_revision_divergence(count, rev, dir_id) + if ( + self.check_revision + and self.check_revision != 0 + and count % self.check_revision == 0 + ): + self._check_revision_divergence(rev, dir_id) if nextrev: revision_parents[nextrev] = [swh_revision.id] @@ -397,14 +400,11 @@ if latest_snapshot_revision: self.latest_snapshot, self.latest_revision = latest_snapshot_revision - if self.destination_path: - local_dirname = self.destination_path - else: - local_dirname = tempfile.mkdtemp( - suffix="-%s" % os.getpid(), - prefix=TEMPORARY_DIR_PREFIX_PATTERN, - dir=self.temp_directory, - ) + local_dirname = tempfile.mkdtemp( + suffix="-%s" % os.getpid(), + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + dir=self.temp_directory, + ) try: self.svnrepo = SvnRepo( @@ -544,22 +544,19 @@ def post_load(self, success: bool = True) -> None: if success and self._last_revision is not None: - # force revision divergence check - self.check_revision = 1 # check if the reconstructed filesystem for the last loaded revision is - # consistent with the one obtained with a svn export operation, if it is - # not an exception will be raised to report the issue and mark the visit - # as partial + # consistent with the one obtained with a svn export operation. If it is not + # the case, an exception will be raised to report the issue and mark the + # visit as partial self._check_revision_divergence( - self.check_revision, int(dict(self._last_revision.extra_headers)[b"svn_revision"]), self._last_revision.directory, ) class SvnLoaderFromDumpArchive(SvnLoader): - """Uncompress an archive containing an svn dump, mount the svn dump as - an svn repository and load said repository. + """Uncompress an archive containing an svn dump, mount the svn dump as a local svn + repository and load that repository. """ @@ -569,8 +566,6 @@ url: str, archive_path: str, origin_url: Optional[str] = None, - destination_path: Optional[str] = None, - swh_revision: Optional[str] = None, incremental: bool = False, visit_date: Optional[datetime] = None, temp_directory: str = "/tmp", @@ -582,8 +577,6 @@ storage=storage, url=url, origin_url=origin_url, - destination_path=destination_path, - swh_revision=swh_revision, incremental=incremental, visit_date=visit_date, temp_directory=temp_directory, @@ -619,9 +612,9 @@ class SvnLoaderFromRemoteDump(SvnLoader): - """ - Create a subversion repository dump using the svnrdump utility, - mount it locally and load the repository from it. + """Create a subversion repository dump out of a remote svn repository (using the + svnrdump utility). Then, mount the repository locally and load that repository. + """ def __init__( @@ -629,8 +622,6 @@ storage: StorageInterface, url: str, origin_url: Optional[str] = None, - destination_path: Optional[str] = None, - swh_revision: Optional[str] = None, incremental: bool = True, visit_date: Optional[datetime] = None, temp_directory: str = "/tmp", @@ -642,8 +633,6 @@ storage=storage, url=url, origin_url=origin_url, - destination_path=destination_path, - swh_revision=swh_revision, incremental=incremental, visit_date=visit_date, temp_directory=temp_directory, @@ -674,15 +663,17 @@ pass return svn_revision - def dump_svn_revisions(self, svn_url, last_loaded_svn_rev=-1): - """ - Generate a subversion dump file using the svnrdump tool. - If the svnrdump command failed somehow, - the produced dump file is analyzed to determine if a partial - loading is still feasible. + def dump_svn_revisions(self, svn_url: str, last_loaded_svn_rev: int = -1) -> str: + """Generate a subversion dump file using the svnrdump tool. If the svnrdump + command failed somehow, the produced dump file is analyzed to determine if a + partial loading is still feasible. Raises: NotFound when the repository is no longer found at url + + Returns: + The dump_path of the repository mounted + """ # Build the svnrdump command line svnrdump_cmd = ["svnrdump", "dump", svn_url] diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py --- a/swh/loader/svn/tasks.py +++ b/swh/loader/svn/tasks.py @@ -24,8 +24,6 @@ *, url: Optional[str] = None, origin_url: Optional[str] = None, - destination_path: Optional[str] = None, - swh_revision: Optional[str] = None, visit_date: Optional[str] = None, incremental: Optional[bool] = True, ): @@ -35,9 +33,6 @@ url: (mandatory) svn's repository url to ingest data from origin_url: Optional original url override to use as origin reference in the archive. If not provided, "url" is used as origin. - destination_path: (optional) root directory to locally retrieve svn's data - swh_revision: (optional) extra revision hex to start from. See - swh.loader.svn.SvnLoader.process docstring visit_date: Optional date to override the visit date incremental: If True, the default, starts from the last snapshot (if any). Otherwise, starts from the initial commit of the repository. @@ -48,8 +43,6 @@ loader = SvnLoader.from_configfile( url=url, origin_url=origin_url, - destination_path=destination_path, - swh_revision=swh_revision, visit_date=convert_to_datetime(visit_date), incremental=incremental, ) diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py --- a/swh/loader/svn/tests/test_loader.py +++ b/swh/loader/svn/tests/test_loader.py @@ -55,7 +55,7 @@ def test_loader_svn_not_found_no_mock(swh_storage, tmp_path): """Given an unknown repository, the loader visit ends up in status not_found""" repo_url = "unknown-repository" - loader = SvnLoader(swh_storage, repo_url, destination_path=tmp_path) + loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path) assert loader.load() == {"status": "uneventful"} @@ -73,7 +73,7 @@ mock.side_effect = SubversionException(exception_msg, 0) unknown_repo_url = "unknown-repository" - loader = SvnLoader(swh_storage, unknown_repo_url, destination_path=tmp_path) + loader = SvnLoader(swh_storage, unknown_repo_url, temp_directory=tmp_path) assert loader.load() == {"status": "uneventful"} @@ -96,7 +96,7 @@ mock.side_effect = exception existing_repo_url = "existing-repo-url" - loader = SvnLoader(swh_storage, existing_repo_url, destination_path=tmp_path) + loader = SvnLoader(swh_storage, existing_repo_url, temp_directory=tmp_path) assert loader.load() == {"status": "failed"} @@ -110,7 +110,7 @@ unknown_repo_url = "file:///tmp/svn.code.sf.net/p/white-rats-studios/svn" loader = SvnLoaderFromRemoteDump( - swh_storage, unknown_repo_url, destination_path=tmp_path + swh_storage, unknown_repo_url, temp_directory=tmp_path ) assert loader.load() == {"status": "uneventful"} @@ -143,16 +143,14 @@ ) repo_url = f"file://{repo_path}" - loader = SvnLoaderFromRemoteDump( - swh_storage, repo_url, destination_path=loading_path - ) + loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=loading_path) assert loader.load() == {"status": "eventful"} actual_visit = assert_last_visit_matches( swh_storage, repo_url, status="full", type="svn", ) loader2 = SvnLoaderFromRemoteDump( - swh_storage, repo_url, destination_path=loading_path + swh_storage, repo_url, temp_directory=loading_path ) # Visiting a second time the same repository should be uneventful... assert loader2.load() == {"status": "uneventful"} @@ -171,7 +169,7 @@ archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = SvnLoader(swh_storage, repo_url, destination_path=tmp_path) + loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path) assert loader.load() == {"status": "eventful"} @@ -239,7 +237,7 @@ )[0] assert start_revision is not None - loader = SvnLoader(swh_storage, repo_url, swh_revision=start_revision) + loader = SvnLoader(swh_storage, repo_url) assert loader.load() == {"status": "uneventful"} stats = get_stats(loader.storage) @@ -424,12 +422,7 @@ ) # we'll start from start_revision - loader = SvnLoader( - swh_storage, - repo_updated_url, - origin_url=repo_initial_url, - swh_revision=start_revision, - ) + loader = SvnLoader(swh_storage, repo_updated_url, origin_url=repo_initial_url,) assert loader.load() == {"status": "eventful"} @@ -764,7 +757,7 @@ archive_path = os.path.join(datadir, f"{archive_name}-add-remove-dir.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = SvnLoader(swh_storage, repo_url, destination_path=tmp_path) + loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( @@ -914,9 +907,7 @@ # instantiate a svn loader checking after each processed revision that # the repository filesystem it reconstructed does not differ from a subversion # export of that revision - loader = SvnLoader( - swh_storage, repo_url, destination_path=tmp_path, check_revision=1 - ) + loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( @@ -980,9 +971,7 @@ # instantiate a svn loader checking after each processed revision that # the repository filesystem it reconstructed does not differ from a subversion # export of that revision - loader = SvnLoader( - swh_storage, repo_url, destination_path=tmp_path, check_revision=1 - ) + loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( @@ -1072,9 +1061,7 @@ # instantiate a svn loader checking after each processed revision that # the repository filesystem it reconstructed does not differ from a subversion # export of that revision - loader = SvnLoader( - swh_storage, repo_url, destination_path=tmp_path, check_revision=1 - ) + loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( @@ -1137,9 +1124,7 @@ # instantiate a svn loader checking after each processed revision that # the repository filesystem it reconstructed does not differ from a subversion # export of that revision - loader = SvnLoader( - swh_storage, repo_url, destination_path=tmp_path, check_revision=1 - ) + loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( @@ -1181,7 +1166,7 @@ ], ) - loader = SvnLoader(swh_storage, repo_url, destination_path=tmp_path) + loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path) # post loading will detect an issue and make a partial visit with a snapshot assert loader.load() == {"status": "failed"} @@ -1249,9 +1234,7 @@ # instantiate a svn loader checking after each processed revision that # the repository filesystem it reconstructed does not differ from a subversion # export of that revision - loader = SvnLoader( - swh_storage, repo_url, destination_path=tmp_path, check_revision=1 - ) + loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1) assert loader.load() == {"status": "eventful"} assert_last_visit_matches( @@ -1268,9 +1251,7 @@ def _check_revision_divergence(self, count, rev, dir_id): raise ValueError("revision divergence detected") - loader = SvnLoaderRevisionDivergence( - swh_storage, repo_url, destination_path=tmp_path - ) + loader = SvnLoaderRevisionDivergence(swh_storage, repo_url, temp_directory=tmp_path) assert loader.load() == {"status": "failed"}