Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/mercurial/loader.py
Show First 20 Lines • Show All 52 Lines • ▼ Show 20 Lines | from swh.model.hashutil import ( | ||||
MultiHash, | MultiHash, | ||||
hash_to_hex, | hash_to_hex, | ||||
hash_to_bytehex, | hash_to_bytehex, | ||||
hash_to_bytes, | hash_to_bytes, | ||||
DEFAULT_ALGORITHMS, | DEFAULT_ALGORITHMS, | ||||
) | ) | ||||
from swh.loader.core.loader import DVCSLoader | from swh.loader.core.loader import DVCSLoader | ||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||
from swh.storage.algos.origin import origin_get_latest_visit_status | |||||
from . import converters | from . import converters | ||||
from .archive_extract import tmp_extract | from .archive_extract import tmp_extract | ||||
from .bundle20_reader import Bundle20Reader | from .bundle20_reader import Bundle20Reader | ||||
from .converters import PRIMARY_ALGO as ALGO | from .converters import PRIMARY_ALGO as ALGO | ||||
from .objects import SelectiveCache, SimpleTree | from .objects import SelectiveCache, SimpleTree | ||||
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines | ): | ||||
self.temp_directory = self.config["temp_directory"] | self.temp_directory = self.config["temp_directory"] | ||||
self.cache1_size = self.config["cache1_size"] | self.cache1_size = self.config["cache1_size"] | ||||
self.cache2_size = self.config["cache2_size"] | self.cache2_size = self.config["cache2_size"] | ||||
self.clone_timeout = self.config["clone_timeout_seconds"] | self.clone_timeout = self.config["clone_timeout_seconds"] | ||||
self.working_directory = None | self.working_directory = None | ||||
self.bundle_path = None | self.bundle_path = None | ||||
self.heads = {} | self.heads = {} | ||||
self.releases = {} | self.releases = {} | ||||
self.last_snapshot_id: Optional[bytes] = None | |||||
def pre_cleanup(self): | def pre_cleanup(self): | ||||
"""Cleanup potential dangling files from prior runs (e.g. OOM killed | """Cleanup potential dangling files from prior runs (e.g. OOM killed | ||||
tasks) | tasks) | ||||
""" | """ | ||||
clean_dangling_folders( | clean_dangling_folders( | ||||
self.temp_directory, | self.temp_directory, | ||||
Show All 35 Lines | def get_heads(self, repo): | ||||
return b | return b | ||||
def prepare_origin_visit(self, *args, **kwargs) -> None:
    """Set up the origin and visit state before loading.

    Builds the Origin model object, normalizes the visit date to a
    datetime, and records the snapshot id of the latest visit (if any)
    so load_status() can later decide eventful vs. uneventful.
    """
    self.origin = Origin(url=self.origin_url)
    # visit_date may arrive as either a string or a datetime; normalize it.
    date = self.visit_date
    self.visit_date = parser.parse(date) if isinstance(date, str) else date
    visit_and_status = origin_get_latest_visit_status(
        self.storage, self.origin_url, require_snapshot=True
    )
    if visit_and_status is not None:
        _, visit_status = visit_and_status
        self.last_snapshot_id = visit_status.snapshot
    else:
        # No prior visit with a snapshot: nothing to compare against.
        self.last_snapshot_id = None
ardumont: I could require branches_count=1 as well, as we don't do much with that snapshot.
Currently that call could end up being heavy if the snapshot is huge (and in the end we only use it to have its id...).
Done Inline Actions — ardumont: well, I did ;)
Shout out if you don't agree.
@staticmethod | @staticmethod | ||||
def clone_with_timeout(log, origin, destination, timeout): | def clone_with_timeout(log, origin, destination, timeout): | ||||
queue = billiard.Queue() | queue = billiard.Queue() | ||||
start = time.monotonic() | start = time.monotonic() | ||||
def do_clone(queue, origin, destination): | def do_clone(queue, origin, destination): | ||||
try: | try: | ||||
▲ Show 20 Lines • Show All 411 Lines • ▼ Show 20 Lines | def get_fetch_history_result(self): | ||||
"directories": self.num_directories, | "directories": self.num_directories, | ||||
"revisions": self.num_revisions, | "revisions": self.num_revisions, | ||||
"releases": self.num_releases, | "releases": self.num_releases, | ||||
} | } | ||||
def load_status(self):
    """Report whether this load produced anything new.

    Compares the id of the snapshot built by this run against the
    snapshot id recorded from the latest prior visit (set in
    prepare_origin_visit). An identical id means nothing changed.

    Returns:
        dict: {"status": "eventful"} when the snapshot differs from the
        previous visit's (or there was none), {"status": "uneventful"}
        otherwise.
    """
    current_snapshot = self.get_snapshot()
    if self.last_snapshot_id is not None and self.last_snapshot_id == current_snapshot.id:
        status = "uneventful"
    else:
        status = "eventful"
    return {
        "status": status,
    }
class HgArchiveBundle20Loader(HgBundle20Loader): | class HgArchiveBundle20Loader(HgBundle20Loader): | ||||
"""Mercurial loader for repository wrapped within archives. | """Mercurial loader for repository wrapped within archives. | ||||
Show All 29 Lines |
I could require branches_count=1 as well as we don't do much with that snapshot.
Currently that call could end up being heavy if the snapshot is huge (and in the end we only use it to have its id...