Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/mercurial/from_disk.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
from collections import deque | from collections import deque | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
from shutil import rmtree | from shutil import rmtree | ||||
from tempfile import mkdtemp | from tempfile import mkdtemp | ||||
from typing import Any, Deque, Dict, Optional, Tuple, Union | from typing import Any, Deque, Dict, Optional, Tuple, Union | ||||
import dateutil | import dateutil | ||||
marmoute: I am not super happy to import some internal Mercurial class, but there doesn't seem to be an…
from swh.core.config import merge_configs | from swh.core.config import merge_configs | ||||
from swh.loader.core.loader import BaseLoader | from swh.loader.core.loader import BaseLoader | ||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||
from swh.model.from_disk import Content, DentryPerms, Directory | from swh.model.from_disk import Content, DentryPerms, Directory | ||||
from swh.model.hashutil import MultiHash, hash_to_bytehex | from swh.model.hashutil import MultiHash, hash_to_bytehex | ||||
from swh.model.model import Content as ModelContent | from swh.model.model import Content as ModelContent | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
▲ Show 20 Lines • Show All 98 Lines • ▼ Show 20 Lines | ): | ||||
self.origin_url = url | self.origin_url = url | ||||
self.visit_date = parse_visit_date(visit_date) | self.visit_date = parse_visit_date(visit_date) | ||||
self.directory = directory | self.directory = directory | ||||
self._repo: Optional[hgutil.Repository] = None | self._repo: Optional[hgutil.Repository] = None | ||||
self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} | self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} | ||||
self._repo_directory: Optional[str] = None | self._repo_directory: Optional[str] = None | ||||
# Cache the content hash across revisions to avoid recalculation. | |||||
self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( | |||||
self.config["content_cache_size"], | |||||
) | |||||
def pre_cleanup(self) -> None: | def pre_cleanup(self) -> None: | ||||
"""As a first step, will try and check for dangling data to cleanup. | """As a first step, will try and check for dangling data to cleanup. | ||||
This should do its best to avoid raising issues. | This should do its best to avoid raising issues. | ||||
""" | """ | ||||
clean_dangling_folders( | clean_dangling_folders( | ||||
self._temp_directory, | self._temp_directory, | ||||
pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, | pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
▲ Show 20 Lines • Show All 216 Lines • ▼ Show 20 Lines | def store_content(self, rev_ctx: hgutil.BaseContext, file_path: bytes) -> Content: | ||||
rev_ctx: the he revision context. | rev_ctx: the he revision context. | ||||
file_path: the hg path of the content. | file_path: the hg path of the content. | ||||
Returns: | Returns: | ||||
the swhid of the top level directory. | the swhid of the top level directory. | ||||
""" | """ | ||||
hg_nodeid = rev_ctx.node() | hg_nodeid = rev_ctx.node() | ||||
file_ctx = rev_ctx[file_path] | file_ctx = rev_ctx[file_path] | ||||
Done Inline ActionsThis should be file_ctx.filenode(), should it not ? marmoute: This should be `file_ctx.filenode()`, should it not ? | |||||
file_nodeid = file_ctx.filenode() | |||||
perms = FLAG_PERMS[file_ctx.flags()] | perms = FLAG_PERMS[file_ctx.flags()] | ||||
data = file_ctx.data() # caching is simple and will come in the next revision. | |||||
# Key is file_nodeid + perms because permissions does not participate | |||||
# in content hash in hg while it is the case in swh. | |||||
cache_key = (file_nodeid, perms) | |||||
sha1_git = self._content_hash_cache.get(cache_key) | |||||
if sha1_git is not None: | |||||
return Content({"sha1_git": sha1_git, "perms": perms}) | |||||
data = file_ctx.data() | |||||
content_data = MultiHash.from_data(data).digest() | content_data = MultiHash.from_data(data).digest() | ||||
content_data["length"] = len(data) | content_data["length"] = len(data) | ||||
content_data["perms"] = perms | content_data["perms"] = perms | ||||
content_data["data"] = data | content_data["data"] = data | ||||
content_data["status"] = "visible" | content_data["status"] = "visible" | ||||
content = Content(content_data) | content = Content(content_data) | ||||
model = content.to_model() | model = content.to_model() | ||||
if isinstance(model, ModelContent): | if isinstance(model, ModelContent): | ||||
self.storage.content_add([model]) | self.storage.content_add([model]) | ||||
else: | else: | ||||
raise ValueError( | raise ValueError( | ||||
f"{file_path!r} at rev {hg_nodeid.hex()!r} " | f"{file_path!r} at rev {hg_nodeid.hex()!r} " | ||||
"produced {type(model)!r} instead of {ModelContent!r}" | "produced {type(model)!r} instead of {ModelContent!r}" | ||||
) | ) | ||||
self._content_hash_cache[cache_key] = content.hash | |||||
# Here we make sure to return only necessary data. | # Here we make sure to return only necessary data. | ||||
return Content({"sha1_git": content.hash, "perms": perms}) | return Content({"sha1_git": content.hash, "perms": perms}) | ||||
def store_directories(self, rev_ctx: hgutil.BaseContext) -> Sha1Git: | def store_directories(self, rev_ctx: hgutil.BaseContext) -> Sha1Git: | ||||
"""Store a revision directories given its hg nodeid. | """Store a revision directories given its hg nodeid. | ||||
Mercurial as no directory as in git. A Git like tree must be build | Mercurial as no directory as in git. A Git like tree must be build | ||||
from file paths to obtain each directory hash. | from file paths to obtain each directory hash. | ||||
▲ Show 20 Lines • Show All 80 Lines • Show Last 20 Lines |
I am not super happy to import some internal Mercurial class, but there doesn't seem to be an easily accessible LRU cache implementation otherwise.
Could we move this into hgutil to avoid importing Mercurial directly from here?