Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/svn/loader.py
Show First 20 Lines • Show All 63 Lines • ▼ Show 20 Lines | class SvnLoader(BaseLoader): | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
storage: StorageInterface, | storage: StorageInterface, | ||||
url: str, | url: str, | ||||
origin_url: Optional[str] = None, | origin_url: Optional[str] = None, | ||||
visit_date: Optional[datetime] = None, | visit_date: Optional[datetime] = None, | ||||
destination_path: Optional[str] = None, | destination_path: Optional[str] = None, | ||||
swh_revision: Optional[str] = None, | swh_revision: Optional[str] = None, | ||||
start_from_scratch: bool = False, | incremental: bool = True, | ||||
temp_directory: str = "/tmp", | temp_directory: str = "/tmp", | ||||
debug: bool = False, | debug: bool = False, | ||||
check_revision: int = 0, | check_revision: int = 0, | ||||
max_content_size: Optional[int] = None, | max_content_size: Optional[int] = None, | ||||
): | ): | ||||
"""Load an svn repository. | |||||
Args: | |||||
... | |||||
incremental: If True, the default, starts from the last snapshot (if any). | |||||
Otherwise, starts from the initial commit of the repository. | |||||
""" | |||||
super().__init__( | super().__init__( | ||||
storage=storage, | storage=storage, | ||||
logging_class="swh.loader.svn.SvnLoader", | logging_class="swh.loader.svn.SvnLoader", | ||||
max_content_size=max_content_size, | max_content_size=max_content_size, | ||||
) | ) | ||||
# technical svn uri to act on svn repository | # technical svn uri to act on svn repository | ||||
self.svn_url = url | self.svn_url = url | ||||
# origin url as unique identifier for origin in swh archive | # origin url as unique identifier for origin in swh archive | ||||
Show All 11 Lines | ): | ||||
self._revisions: List[Revision] = [] | self._revisions: List[Revision] = [] | ||||
self._snapshot: Optional[Snapshot] = None | self._snapshot: Optional[Snapshot] = None | ||||
# internal state, current visit | # internal state, current visit | ||||
self._last_revision = None | self._last_revision = None | ||||
self._visit_status = "full" | self._visit_status = "full" | ||||
self._load_status = "uneventful" | self._load_status = "uneventful" | ||||
self.visit_date = visit_date | self.visit_date = visit_date | ||||
self.destination_path = destination_path | self.destination_path = destination_path | ||||
self.start_from_scratch = start_from_scratch | self.incremental = incremental | ||||
self.snapshot = None | self.snapshot = None | ||||
# state from previous visit | # state from previous visit | ||||
self.latest_snapshot = None | self.latest_snapshot = None | ||||
self.latest_revision = None | self.latest_revision = None | ||||
def pre_cleanup(self): | def pre_cleanup(self): | ||||
"""Cleanup potential dangling files from prior runs (e.g. OOM killed | """Cleanup potential dangling files from prior runs (e.g. OOM killed | ||||
tasks) | tasks) | ||||
▲ Show 20 Lines • Show All 105 Lines • ▼ Show 20 Lines | ) -> bool: | ||||
rev = revision_start | rev = revision_start | ||||
rev, _, commit, _, root_dir = list(hash_data_per_revs)[0] | rev, _, commit, _, root_dir = list(hash_data_per_revs)[0] | ||||
dir_id = root_dir.hash | dir_id = root_dir.hash | ||||
swh_revision = self.build_swh_revision(rev, commit, dir_id, parents) | swh_revision = self.build_swh_revision(rev, commit, dir_id, parents) | ||||
swh_revision_id = swh_revision.id | swh_revision_id = swh_revision.id | ||||
return swh_revision_id == revision_id | return swh_revision_id == revision_id | ||||
def start_from( | def start_from(self) -> Tuple[int, int, Dict[int, Tuple[bytes, ...]]]: | ||||
self, start_from_scratch: bool = False | |||||
) -> Tuple[int, int, Dict[int, Tuple[bytes, ...]]]: | |||||
"""Determine from where to start the loading. | """Determine from where to start the loading. | ||||
Args: | |||||
start_from_scratch: As opposed to start from the last snapshot | |||||
Returns: | Returns: | ||||
tuple (revision_start, revision_end, revision_parents) | tuple (revision_start, revision_end, revision_parents) | ||||
Raises: | Raises: | ||||
SvnLoaderHistoryAltered: When a hash divergence has been | SvnLoaderHistoryAltered: When a hash divergence has been | ||||
detected (should not happen) | detected (should not happen) | ||||
SvnLoaderUneventful: Nothing changed since last visit | SvnLoaderUneventful: Nothing changed since last visit | ||||
""" | """ | ||||
assert self.svnrepo is not None, "svnrepo initialized in the `prepare` method" | assert self.svnrepo is not None, "svnrepo initialized in the `prepare` method" | ||||
revision_head = self.svnrepo.head_revision() | revision_head = self.svnrepo.head_revision() | ||||
if revision_head == 0: # empty repository case | if revision_head == 0: # empty repository case | ||||
revision_start = 0 | revision_start = 0 | ||||
revision_end = 0 | revision_end = 0 | ||||
else: # default configuration | else: # default configuration | ||||
revision_start = self.svnrepo.initial_revision() | revision_start = self.svnrepo.initial_revision() | ||||
revision_end = revision_head | revision_end = revision_head | ||||
revision_parents: Dict[int, Tuple[bytes, ...]] = {revision_start: ()} | revision_parents: Dict[int, Tuple[bytes, ...]] = {revision_start: ()} | ||||
# start from a previous revision if any | # start from a previous revision if any | ||||
if not start_from_scratch and self.latest_revision is not None: | if self.incremental and self.latest_revision is not None: | ||||
extra_headers = dict(self.latest_revision.extra_headers) | extra_headers = dict(self.latest_revision.extra_headers) | ||||
revision_start = int(extra_headers[b"svn_revision"]) | revision_start = int(extra_headers[b"svn_revision"]) | ||||
revision_parents = { | revision_parents = { | ||||
revision_start: self.latest_revision.parents, | revision_start: self.latest_revision.parents, | ||||
} | } | ||||
self.log.debug( | self.log.debug( | ||||
"svn export --ignore-keywords %s@%s", | "svn export --ignore-keywords %s@%s", | ||||
▲ Show 20 Lines • Show All 144 Lines • ▼ Show 20 Lines | def prepare(self): | ||||
] | ] | ||||
for msg in error_msgs: | for msg in error_msgs: | ||||
if msg in e.args[0]: | if msg in e.args[0]: | ||||
self._load_status = "uneventful" | self._load_status = "uneventful" | ||||
raise NotFound(e) | raise NotFound(e) | ||||
raise | raise | ||||
try: | try: | ||||
revision_start, revision_end, revision_parents = self.start_from( | revision_start, revision_end, revision_parents = self.start_from() | ||||
self.start_from_scratch | |||||
) | |||||
self.swh_revision_gen = self.process_svn_revisions( | self.swh_revision_gen = self.process_svn_revisions( | ||||
self.svnrepo, revision_start, revision_end, revision_parents | self.svnrepo, revision_start, revision_end, revision_parents | ||||
) | ) | ||||
except SvnLoaderUneventful as e: | except SvnLoaderUneventful as e: | ||||
self.log.warning(e) | self.log.warning(e) | ||||
if self.latest_snapshot: | if self.latest_snapshot: | ||||
self._snapshot = self.latest_snapshot | self._snapshot = self.latest_snapshot | ||||
self.done = True | self.done = True | ||||
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines | class SvnLoaderFromDumpArchive(SvnLoader): | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
storage: StorageInterface, | storage: StorageInterface, | ||||
url: str, | url: str, | ||||
archive_path: str, | archive_path: str, | ||||
origin_url: Optional[str] = None, | origin_url: Optional[str] = None, | ||||
destination_path: Optional[str] = None, | destination_path: Optional[str] = None, | ||||
swh_revision: Optional[str] = None, | swh_revision: Optional[str] = None, | ||||
start_from_scratch: bool = False, | incremental: bool = False, | ||||
visit_date: Optional[datetime] = None, | visit_date: Optional[datetime] = None, | ||||
temp_directory: str = "/tmp", | temp_directory: str = "/tmp", | ||||
debug: bool = False, | debug: bool = False, | ||||
check_revision: int = 0, | check_revision: int = 0, | ||||
max_content_size: Optional[int] = None, | max_content_size: Optional[int] = None, | ||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
storage=storage, | storage=storage, | ||||
url=url, | url=url, | ||||
origin_url=origin_url, | origin_url=origin_url, | ||||
destination_path=destination_path, | destination_path=destination_path, | ||||
swh_revision=swh_revision, | swh_revision=swh_revision, | ||||
start_from_scratch=start_from_scratch, | incremental=incremental, | ||||
visit_date=visit_date, | visit_date=visit_date, | ||||
temp_directory=temp_directory, | temp_directory=temp_directory, | ||||
debug=debug, | debug=debug, | ||||
check_revision=check_revision, | check_revision=check_revision, | ||||
max_content_size=max_content_size, | max_content_size=max_content_size, | ||||
) | ) | ||||
self.archive_path = archive_path | self.archive_path = archive_path | ||||
self.temp_dir = None | self.temp_dir = None | ||||
Show All 30 Lines | class SvnLoaderFromRemoteDump(SvnLoader): | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
storage: StorageInterface, | storage: StorageInterface, | ||||
url: str, | url: str, | ||||
origin_url: Optional[str] = None, | origin_url: Optional[str] = None, | ||||
destination_path: Optional[str] = None, | destination_path: Optional[str] = None, | ||||
swh_revision: Optional[str] = None, | swh_revision: Optional[str] = None, | ||||
start_from_scratch: bool = False, | incremental: bool = True, | ||||
visit_date: Optional[datetime] = None, | visit_date: Optional[datetime] = None, | ||||
temp_directory: str = "/tmp", | temp_directory: str = "/tmp", | ||||
debug: bool = False, | debug: bool = False, | ||||
check_revision: int = 0, | check_revision: int = 0, | ||||
max_content_size: Optional[int] = None, | max_content_size: Optional[int] = None, | ||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
storage=storage, | storage=storage, | ||||
url=url, | url=url, | ||||
origin_url=origin_url, | origin_url=origin_url, | ||||
destination_path=destination_path, | destination_path=destination_path, | ||||
swh_revision=swh_revision, | swh_revision=swh_revision, | ||||
start_from_scratch=start_from_scratch, | incremental=incremental, | ||||
visit_date=visit_date, | visit_date=visit_date, | ||||
temp_directory=temp_directory, | temp_directory=temp_directory, | ||||
debug=debug, | debug=debug, | ||||
check_revision=check_revision, | check_revision=check_revision, | ||||
max_content_size=max_content_size, | max_content_size=max_content_size, | ||||
) | ) | ||||
self.temp_dir = tempfile.mkdtemp(dir=self.temp_directory) | self.temp_dir = tempfile.mkdtemp(dir=self.temp_directory) | ||||
self.repo_path = None | self.repo_path = None | ||||
▲ Show 20 Lines • Show All 158 Lines • Show Last 20 Lines |