Changeset View
Standalone View
swh/loader/mercurial/from_disk.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
from collections import deque | from collections import deque | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
from shutil import rmtree | from shutil import rmtree | ||||
from tempfile import mkdtemp | from tempfile import mkdtemp | ||||
from typing import Any, Deque, Dict, Optional, Tuple, Union | from typing import Any, Deque, Dict, Optional, Tuple, TypeVar, Union | ||||
import dateutil | import dateutil | ||||
from swh.core.config import merge_configs | from swh.core.config import merge_configs | ||||
from swh.loader.core.loader import BaseLoader | from swh.loader.core.loader import BaseLoader | ||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||
from swh.model.from_disk import Content, DentryPerms, Directory | from swh.model.from_disk import Content, DentryPerms, Directory | ||||
from swh.model.hashutil import MultiHash, hash_to_bytehex | from swh.model.hashutil import MultiHash, hash_to_bytehex | ||||
Show All 24 Lines | |||||
DEFAULT_CONFIG: Dict[str, Any] = { | DEFAULT_CONFIG: Dict[str, Any] = { | ||||
"temp_directory": "/tmp", | "temp_directory": "/tmp", | ||||
"clone_timeout_seconds": 7200, | "clone_timeout_seconds": 7200, | ||||
"content_cache_size": 10_000, | "content_cache_size": 10_000, | ||||
} | } | ||||
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" | TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" | ||||
T = TypeVar("T") | |||||
def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]: | def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]: | ||||
"""Convert visit date from Optional[Union[str, datetime]] to Optional[datetime]. | """Convert visit date from Optional[Union[str, datetime]] to Optional[datetime]. | ||||
`HgLoaderFromDisk` accepts `str` and `datetime` as visit date | `HgLoaderFromDisk` accepts `str` and `datetime` as visit date | ||||
while `BaseLoader` only deals with `datetime`. | while `BaseLoader` only deals with `datetime`. | ||||
""" | """ | ||||
if visit_date is None: | if visit_date is None: | ||||
return None | return None | ||||
if isinstance(visit_date, datetime): | if isinstance(visit_date, datetime): | ||||
return visit_date | return visit_date | ||||
if visit_date == "now": | if visit_date == "now": | ||||
return datetime.now(tz=timezone.utc) | return datetime.now(tz=timezone.utc) | ||||
if isinstance(visit_date, str): | if isinstance(visit_date, str): | ||||
return dateutil.parser.parse(visit_date) | return dateutil.parser.parse(visit_date) | ||||
return ValueError(f"invalid visit date {visit_date!r}") | return ValueError(f"invalid visit date {visit_date!r}") | ||||
class HgDirectory(Directory): | class HgDirectory(Directory): | ||||
"""A directory that creates parent directories if missing.""" | """A more practical directory. | ||||
- creates missing parent directories | |||||
- removes empty directories | |||||
""" | |||||
def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None: | def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None: | ||||
if b"/" in path: | if b"/" in path: | ||||
head, tail = path.split(b"/", 1) | head, tail = path.split(b"/", 1) | ||||
directory = self.get(head) | directory = self.get(head) | ||||
if directory is None: | if directory is None or isinstance(directory, Content): | ||||
directory = HgDirectory() | directory = HgDirectory() | ||||
self[head] = directory | self[head] = directory | ||||
directory[tail] = value | directory[tail] = value | ||||
else: | else: | ||||
super().__setitem__(path, value) | super().__setitem__(path, value) | ||||
def __delitem__(self, path: bytes) -> None: | |||||
super().__delitem__(path) | |||||
while b"/" in path: # remove empty parent directories | |||||
path = path.rsplit(b"/", 1)[0] | |||||
if len(self[path]) == 0: | |||||
super().__delitem__(path) | |||||
else: | |||||
break | |||||
def get( | |||||
self, path: bytes, default: Optional[T] = None | |||||
) -> Optional[Union[Content, "HgDirectory", T]]: | |||||
# TODO move to swh.model.from_disk.Directory | |||||
try: | |||||
return self[path] | |||||
except KeyError: | |||||
return default | |||||
marmoute: We need to iterate from the highest level directory, not the lower lever one, right ?
If I am… | |||||
Done Inline Actionsthe way __setitem__ is implemented in Directory the traversal is done recursivly from lower lever to the higher. Going the other way will repeat the traversal lookup for each level. So in this case starting from the higher seems better to me. The code has evolved, but can you explain why the code is wrong? acezar: the way `__setitem__` is implemented in `Directory` the traversal is done recursivly from lower… | |||||
Done Inline Actions(note: I cannot find the original code anymore, (thanks phab).) In short, if you delete empty directory from root to leaf, you can end up with empty directory anyway. Because we checked it they were empty before deleting they empty children. marmoute: (note: I cannot find the original code anymore, (thanks phab).)
In short, if you delete empty… | |||||
Done Inline Actions(was https://forge.softwareheritage.org/D4540?id=16094#inline-31160) I see now. Thanks. The deletion was not necessary anyway as shown by the new code. acezar: (was https://forge.softwareheritage.org/D4540?id=16094#inline-31160)
I see now. Thanks.
The… | |||||
class HgLoaderFromDisk(BaseLoader): | class HgLoaderFromDisk(BaseLoader): | ||||
"""Load a mercurial repository from a local repository.""" | """Load a mercurial repository from a local repository.""" | ||||
CONFIG_BASE_FILENAME = "loader/mercurial" | CONFIG_BASE_FILENAME = "loader/mercurial" | ||||
visit_type = "hg" | visit_type = "hg" | ||||
def __init__( | def __init__( | ||||
Show All 19 Lines | ): | ||||
self._temp_directory = self.config["temp_directory"] | self._temp_directory = self.config["temp_directory"] | ||||
self._clone_timeout = self.config["clone_timeout_seconds"] | self._clone_timeout = self.config["clone_timeout_seconds"] | ||||
self.origin_url = url | self.origin_url = url | ||||
self.visit_date = parse_visit_date(visit_date) | self.visit_date = parse_visit_date(visit_date) | ||||
self.directory = directory | self.directory = directory | ||||
self._repo: Optional[hgutil.Repository] = None | self._repo: Optional[hgutil.Repository] = None | ||||
self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} | self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} | ||||
Done Inline Actionsplease document this attribute marmoute: please document this attribute | |||||
self._repo_directory: Optional[str] = None | self._repo_directory: Optional[str] = None | ||||
Done Inline Actionsthis should be last_root, should it not ? (also, should probably be _last_hg_nodeid and _last root marmoute: this should be `last_root`, should it not ? (also, should probably be `_last_hg_nodeid` and… | |||||
# keeps the last processed hg nodeid | |||||
# it is used for differential tree update by store_directories | |||||
# NULLID is the parent of the first revision | |||||
self._last_hg_nodeid = hgutil.NULLID | |||||
# keeps the last revision tree | |||||
# it is used for differential tree update by store_directories | |||||
self._last_root = HgDirectory() | |||||
# Cache the content hash across revisions to avoid recalculation. | # Cache the content hash across revisions to avoid recalculation. | ||||
self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( | self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( | ||||
self.config["content_cache_size"], | self.config["content_cache_size"], | ||||
) | ) | ||||
def pre_cleanup(self) -> None: | def pre_cleanup(self) -> None: | ||||
"""As a first step, will try and check for dangling data to cleanup. | """As a first step, will try and check for dangling data to cleanup. | ||||
This should do its best to avoid raising issues. | This should do its best to avoid raising issues. | ||||
▲ Show 20 Lines • Show All 268 Lines • ▼ Show 20 Lines | def store_directories(self, rev_ctx: hgutil.BaseContext) -> Sha1Git: | ||||
from file paths to obtain each directory hash. | from file paths to obtain each directory hash. | ||||
Args: | Args: | ||||
rev_ctx: the he revision context. | rev_ctx: the he revision context. | ||||
Returns: | Returns: | ||||
the swhid of the top level directory. | the swhid of the top level directory. | ||||
""" | """ | ||||
root = HgDirectory() | repo: hgutil.Repository = self._repo # mypy can't infer that repo is not None | ||||
for file_path in rev_ctx.manifest(): | prev_ctx = repo[self._last_hg_nodeid] | ||||
Done Inline ActionsWe should consider diffing to the parent, but this is not garantee to be a good idea. Possibly reusing the manifest cache would give use a better performance anyway. So the comment should mention the possible change, but not present it as mandatory. marmoute: We should consider diffing to the parent, but this is not garantee to be a good idea. Possibly… | |||||
# TODO maybe do diff on parents | |||||
status = prev_ctx.status(rev_ctx) | |||||
for file_path in status.removed: | |||||
del self._last_root[file_path] | |||||
for file_path in status.added: | |||||
content = self.store_content(rev_ctx, file_path) | content = self.store_content(rev_ctx, file_path) | ||||
root[file_path] = content | self._last_root[file_path] = content | ||||
for file_path in status.modified: | |||||
content = self.store_content(rev_ctx, file_path) | |||||
self._last_root[file_path] = content | |||||
self._last_hg_nodeid = rev_ctx.node() | |||||
directories: Deque[Directory] = deque([root]) | directories: Deque[Directory] = deque([self._last_root]) | ||||
Done Inline ActionsWhy do we need this ? I would expect the possibly cached hash to be invalidated when we alter the structure in the lines above that one. marmoute: Why do we need this ? I would expect the possibly cached hash to be invalidated when we alter… | |||||
while directories: | while directories: | ||||
directory = directories.pop() | directory = directories.pop() | ||||
self.storage.directory_add([directory.to_model()]) | self.storage.directory_add([directory.to_model()]) | ||||
directories.extend( | directories.extend( | ||||
[item for item in directory.values() if isinstance(item, Directory)] | [item for item in directory.values() if isinstance(item, Directory)] | ||||
) | ) | ||||
return root.hash | return self._last_root.hash | ||||
class HgArchiveLoaderFromDisk(HgLoaderFromDisk): | class HgArchiveLoaderFromDisk(HgLoaderFromDisk): | ||||
"""Mercurial loader for repository wrapped within tarballs.""" | """Mercurial loader for repository wrapped within tarballs.""" | ||||
def __init__( | def __init__( | ||||
self, url: str, visit_date: Optional[datetime] = None, archive_path: str = None | self, url: str, visit_date: Optional[datetime] = None, archive_path: str = None | ||||
): | ): | ||||
▲ Show 20 Lines • Show All 51 Lines • Show Last 20 Lines |
We need to iterate from the highest level directory, not the lower lever one, right ?
If I am not mistaken, this code seems wrong. Why is this not caught by the tests ?