Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/svn/loader.py
Show First 20 Lines • Show All 134 Lines • ▼ Show 20 Lines | def cleanup(self): | ||||
self.log.error( | self.log.error( | ||||
"""NOT FOR PRODUCTION - debug flag activated | """NOT FOR PRODUCTION - debug flag activated | ||||
Local repository not cleaned up for investigation: %s""", | Local repository not cleaned up for investigation: %s""", | ||||
self.svnrepo.local_url.decode("utf-8"), | self.svnrepo.local_url.decode("utf-8"), | ||||
) | ) | ||||
return | return | ||||
self.svnrepo.clean_fs() | self.svnrepo.clean_fs() | ||||
def swh_revision_hash_tree_at_svn_revision(self, revision: int) -> bytes: | def swh_revision_hash_tree_at_svn_revision( | ||||
self, revision: int | |||||
) -> from_disk.Directory: | |||||
"""Compute and return the hash tree at a given svn revision. | """Compute and return the hash tree at a given svn revision. | ||||
Args: | Args: | ||||
rev: the svn revision we want to check | rev: the svn revision we want to check | ||||
Returns: | Returns: | ||||
The hash tree directory as bytes. | The hash tree directory as bytes. | ||||
""" | """ | ||||
assert self.svnrepo is not None | assert self.svnrepo is not None | ||||
local_dirname, local_url = self.svnrepo.export_temporary(revision) | local_dirname, local_url = self.svnrepo.export_temporary(revision) | ||||
root_dir = from_disk.Directory.from_disk(path=local_url) | root_dir = from_disk.Directory.from_disk(path=local_url) | ||||
self.svnrepo.clean_fs(local_dirname) | self.svnrepo.clean_fs(local_dirname) | ||||
return root_dir.hash | return root_dir | ||||
def _latest_snapshot_revision( | def _latest_snapshot_revision( | ||||
self, | self, | ||||
origin_url: str, | origin_url: str, | ||||
) -> Optional[Tuple[Snapshot, Revision]]: | ) -> Optional[Tuple[Snapshot, Revision]]: | ||||
"""Look for latest snapshot revision and returns it if any. | """Look for latest snapshot revision and returns it if any. | ||||
Args: | Args: | ||||
▲ Show 20 Lines • Show All 113 Lines • ▼ Show 20 Lines | def start_from(self) -> Tuple[int, int]: | ||||
"Processing revisions [%s-%s] for %s", | "Processing revisions [%s-%s] for %s", | ||||
revision_start, | revision_start, | ||||
revision_end, | revision_end, | ||||
self.svnrepo, | self.svnrepo, | ||||
) | ) | ||||
return revision_start, revision_end | return revision_start, revision_end | ||||
def _check_revision_divergence(self, rev: int, dir_id: bytes) -> None: | def _check_revision_divergence( | ||||
self, rev: int, dir_id: bytes, dir: from_disk.Directory | |||||
) -> None: | |||||
"""Check for hash revision computation divergence. | """Check for hash revision computation divergence. | ||||
The Rationale behind this is that svn can trigger unknown edge cases (mixed | The Rationale behind this is that svn can trigger unknown edge cases (mixed | ||||
CRLF, svn properties, etc...). Those are not always easy to spot. Adding a | CRLF, svn properties, etc...). Those are not always easy to spot. Adding a | ||||
regular check will help spotting potential missing edge cases. | regular check will help spotting potential missing edge cases. | ||||
Args: | Args: | ||||
rev: The actual revision we are computing from | rev: The actual revision we are computing from | ||||
dir_id: The actual directory for the given revision | dir_id: The actual directory for the given revision | ||||
Raises | Raises | ||||
ValueError if a hash divergence is detected | ValueError if a hash divergence is detected | ||||
""" | """ | ||||
self.log.debug("Checking hash computations on revision %s...", rev) | self.log.debug("Checking hash computations on revision %s...", rev) | ||||
checked_dir_id = self.swh_revision_hash_tree_at_svn_revision(rev) | checked_dir = self.swh_revision_hash_tree_at_svn_revision(rev) | ||||
checked_dir_id = checked_dir.hash | |||||
if checked_dir_id != dir_id: | if checked_dir_id != dir_id: | ||||
# do not bother checking tree differences if root directory id of reconstructed | |||||
# repository filesystem does not match the id of the one from the last loaded | |||||
# revision (can happen when called from post_load and tree differences were checked | |||||
# before the last revision to load) | |||||
if self.debug and dir_id == dir.hash: | |||||
for obj in checked_dir.iter_tree(): | |||||
path = obj.data["path"].replace(checked_dir.data["path"], b"")[1:] | |||||
if not path: | |||||
# ignore root directory | |||||
continue | |||||
if path not in dir: | |||||
self.log.debug( | |||||
"%s with path %s is missing in reconstructed repository filesystem", | |||||
obj.object_type, # type: ignore | |||||
path, | |||||
) | |||||
elif dir[path].hash != checked_dir[path].hash: | |||||
self.log.debug( | |||||
"%s with path %s has different hash in reconstructed repository filesystem", # noqa | |||||
obj.object_type, # type: ignore | |||||
path, | |||||
) | |||||
err = ( | err = ( | ||||
"Hash tree computation divergence detected " | "Hash tree computation divergence detected at revision %s " | ||||
"(%s != %s), stopping!" | "(%s != %s), stopping!" | ||||
% ( | % ( | ||||
rev, | |||||
hashutil.hash_to_hex(dir_id), | hashutil.hash_to_hex(dir_id), | ||||
hashutil.hash_to_hex(checked_dir_id), | hashutil.hash_to_hex(checked_dir_id), | ||||
) | ) | ||||
) | ) | ||||
raise ValueError(err) | raise ValueError(err) | ||||
def process_svn_revisions( | def process_svn_revisions( | ||||
self, svnrepo, revision_start, revision_end | self, svnrepo, revision_start, revision_end | ||||
Show All 37 Lines | ]: | ||||
hashutil.hash_to_hex(dir_id), | hashutil.hash_to_hex(dir_id), | ||||
) | ) | ||||
if ( | if ( | ||||
self.check_revision | self.check_revision | ||||
and self.check_revision != 0 | and self.check_revision != 0 | ||||
and count % self.check_revision == 0 | and count % self.check_revision == 0 | ||||
): | ): | ||||
self._check_revision_divergence(rev, dir_id) | self._check_revision_divergence(rev, dir_id, root_directory) | ||||
parents = (swh_revision.id,) | parents = (swh_revision.id,) | ||||
yield _contents, _skipped_contents, _directories, swh_revision | yield _contents, _skipped_contents, _directories, swh_revision | ||||
if not self.debug and self.svnrepo: | if not self.debug and self.svnrepo: | ||||
# clean directory where revisions were replayed to gain some disk space | # clean directory where revisions were replayed to gain some disk space | ||||
# before the post_load operation | # before the post_load operation | ||||
▲ Show 20 Lines • Show All 158 Lines • ▼ Show 20 Lines | def post_load(self, success: bool = True) -> None: | ||||
if success and self._last_revision is not None: | if success and self._last_revision is not None: | ||||
# check if the reconstructed filesystem for the last loaded revision is | # check if the reconstructed filesystem for the last loaded revision is | ||||
# consistent with the one obtained with a svn export operation. If it is not | # consistent with the one obtained with a svn export operation. If it is not | ||||
# the case, an exception will be raised to report the issue and mark the | # the case, an exception will be raised to report the issue and mark the | ||||
# visit as partial | # visit as partial | ||||
self._check_revision_divergence( | self._check_revision_divergence( | ||||
int(dict(self._last_revision.extra_headers)[b"svn_revision"]), | int(dict(self._last_revision.extra_headers)[b"svn_revision"]), | ||||
self._last_revision.directory, | self._last_revision.directory, | ||||
self.svnrepo.swhreplay.directory, | |||||
) | ) | ||||
def _create_tmp_dir(self, root_tmp_dir: str) -> str: | def _create_tmp_dir(self, root_tmp_dir: str) -> str: | ||||
return tempfile.mkdtemp( | return tempfile.mkdtemp( | ||||
dir=root_tmp_dir, | dir=root_tmp_dir, | ||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | prefix=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
suffix="-%s" % os.getpid(), | suffix="-%s" % os.getpid(), | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 277 Lines • Show Last 20 Lines |