Changeset View
Standalone View
swh/loader/mercurial/from_disk.py
Show First 20 Lines • Show All 65 Lines • ▼ Show 20 Lines | def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]: | ||||
if isinstance(visit_date, str): | if isinstance(visit_date, str): | ||||
return dateutil.parser.parse(visit_date) | return dateutil.parser.parse(visit_date) | ||||
return ValueError(f"invalid visit date {visit_date!r}") | return ValueError(f"invalid visit date {visit_date!r}") | ||||
class HgDirectory(Directory): | class HgDirectory(Directory): | ||||
"""A directory that creates parent directories if missing.""" | """A more practical directory. | ||||
- creates missing parent directories | |||||
- removes empty directories | |||||
""" | |||||
def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None: | def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None: | ||||
if b"/" in path: | if b"/" in path: | ||||
head, tail = path.split(b"/", 1) | head, tail = path.split(b"/", 1) | ||||
directory = self.get(head) | directory = self.get(head) | ||||
if directory is None: | if directory is None or isinstance(directory, Content): | ||||
directory = HgDirectory() | directory = HgDirectory() | ||||
self[head] = directory | self[head] = directory | ||||
directory[tail] = value | directory[tail] = value | ||||
else: | else: | ||||
super().__setitem__(path, value) | super().__setitem__(path, value) | ||||
def __delitem__(self, path: bytes) -> None: | |||||
super().__delitem__(path) | |||||
while b"/" in path: # remove empty parent directories | |||||
path = path.rsplit(b"/", 1)[0] | |||||
if len(self[path]) == 0: | |||||
super().__delitem__(path) | |||||
class HgLoaderFromDisk(BaseLoader): | class HgLoaderFromDisk(BaseLoader): | ||||
"""Load a mercurial repository from a local repository.""" | """Load a mercurial repository from a local repository.""" | ||||
CONFIG_BASE_FILENAME = "loader/mercurial" | CONFIG_BASE_FILENAME = "loader/mercurial" | ||||
visit_type = "hg" | visit_type = "hg" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
url: str, | url: str, | ||||
directory: Optional[str] = None, | directory: Optional[str] = None, | ||||
marmoute: We need to iterate from the highest level directory, not the lower lever one, right ?
If I am… | |||||
Done Inline Actionsthe way __setitem__ is implemented in Directory the traversal is done recursivly from lower lever to the higher. Going the other way will repeat the traversal lookup for each level. So in this case starting from the higher seems better to me. The code has evolved, but can you explain why the code is wrong? acezar: the way `__setitem__` is implemented in `Directory` the traversal is done recursivly from lower… | |||||
Done Inline Actions(note: I cannot find the original code anymore, (thanks phab).) In short, if you delete empty directory from root to leaf, you can end up with empty directory anyway. Because we checked it they were empty before deleting they empty children. marmoute: (note: I cannot find the original code anymore, (thanks phab).)
In short, if you delete empty… | |||||
Done Inline Actions(was https://forge.softwareheritage.org/D4540?id=16094#inline-31160) I see now. Thanks. The deletion was not necessary anyway as shown by the new code. acezar: (was https://forge.softwareheritage.org/D4540?id=16094#inline-31160)
I see now. Thanks.
The… | |||||
logging_class: str = "swh.loader.mercurial.LoaderFromDisk", | logging_class: str = "swh.loader.mercurial.LoaderFromDisk", | ||||
visit_date: Optional[Union[datetime, str]] = None, | visit_date: Optional[Union[datetime, str]] = None, | ||||
config: Optional[Dict[str, Any]] = None, | config: Optional[Dict[str, Any]] = None, | ||||
): | ): | ||||
"""Initialize the loader. | """Initialize the loader. | ||||
Args: | Args: | ||||
url: url of the repository. | url: url of the repository. | ||||
directory: directory of the local repository. | directory: directory of the local repository. | ||||
logging_class: class of the loader logger. | logging_class: class of the loader logger. | ||||
visit_date: visit date of the repository | visit_date: visit date of the repository | ||||
config: loader configuration | config: loader configuration | ||||
""" | """ | ||||
super().__init__(logging_class=logging_class, config=config or {}) | super().__init__(logging_class=logging_class, config=config or {}) | ||||
self.config = merge_configs(DEFAULT_CONFIG, self.config) | self.config = merge_configs(DEFAULT_CONFIG, self.config) | ||||
self._temp_directory = self.config["temp_directory"] | self._temp_directory = self.config["temp_directory"] | ||||
self._clone_timeout = self.config["clone_timeout_seconds"] | self._clone_timeout = self.config["clone_timeout_seconds"] | ||||
self.origin_url = url | self.origin_url = url | ||||
self.visit_date = parse_visit_date(visit_date) | self.visit_date = parse_visit_date(visit_date) | ||||
self.directory = directory | self.directory = directory | ||||
self._repo: Optional[hgutil.Repository] = None | self._repo: Optional[hgutil.Repository] = None | ||||
self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} | self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} | ||||
Done Inline Actionsplease document this attribute marmoute: please document this attribute | |||||
Done Inline Actionsthis should be last_root, should it not ? (also, should probably be _last_hg_nodeid and _last root marmoute: this should be `last_root`, should it not ? (also, should probably be `_last_hg_nodeid` and… | |||||
# keeps the last processed hg nodeid | |||||
# it is used for differential tree update by store_directories | |||||
# NULLID is the parent of the first revision | |||||
self._last_hg_nodeid = hgutil.NULLID | |||||
# keeps the last revision tree | |||||
# it is used for differential tree update by store_directories | |||||
self._last_root = HgDirectory() | |||||
# Cache the content hash across revisions to avoid recalculation. | # Cache the content hash across revisions to avoid recalculation. | ||||
self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( | self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( | ||||
self.config["content_cache_size"], | self.config["content_cache_size"], | ||||
) | ) | ||||
@property | @property | ||||
def repo(self) -> hgutil.Repository: | def repo(self) -> hgutil.Repository: | ||||
"""A filtered mercurial repository. | """A filtered mercurial repository. | ||||
▲ Show 20 Lines • Show All 280 Lines • ▼ Show 20 Lines | def store_directories(self, rev_ctx: hgutil.BaseContext) -> Sha1Git: | ||||
from file paths to obtain each directory hash. | from file paths to obtain each directory hash. | ||||
Args: | Args: | ||||
rev_ctx: the he revision context. | rev_ctx: the he revision context. | ||||
Returns: | Returns: | ||||
the swhid of the top level directory. | the swhid of the top level directory. | ||||
""" | """ | ||||
root = HgDirectory() | prev_ctx = self.repo[self._last_hg_nodeid] | ||||
for file_path in rev_ctx.manifest(): | |||||
# TODO maybe do diff on parents | |||||
Done Inline ActionsWe should consider diffing to the parent, but this is not garantee to be a good idea. Possibly reusing the manifest cache would give use a better performance anyway. So the comment should mention the possible change, but not present it as mandatory. marmoute: We should consider diffing to the parent, but this is not garantee to be a good idea. Possibly… | |||||
status = prev_ctx.status(rev_ctx) | |||||
for file_path in status.removed: | |||||
del self._last_root[file_path] | |||||
for file_path in status.added: | |||||
content = self.store_content(rev_ctx, file_path) | content = self.store_content(rev_ctx, file_path) | ||||
root[file_path] = content | self._last_root[file_path] = content | ||||
for file_path in status.modified: | |||||
content = self.store_content(rev_ctx, file_path) | |||||
self._last_root[file_path] = content | |||||
self._last_hg_nodeid = rev_ctx.node() | |||||
directories: Deque[Directory] = deque([root]) | directories: Deque[Directory] = deque([self._last_root]) | ||||
Done Inline ActionsWhy do we need this ? I would expect the possibly cached hash to be invalidated when we alter the structure in the lines above that one. marmoute: Why do we need this ? I would expect the possibly cached hash to be invalidated when we alter… | |||||
while directories: | while directories: | ||||
directory = directories.pop() | directory = directories.pop() | ||||
self.storage.directory_add([directory.to_model()]) | self.storage.directory_add([directory.to_model()]) | ||||
directories.extend( | directories.extend( | ||||
[item for item in directory.values() if isinstance(item, Directory)] | [item for item in directory.values() if isinstance(item, Directory)] | ||||
) | ) | ||||
return root.hash | return self._last_root.hash | ||||
class HgArchiveLoaderFromDisk(HgLoaderFromDisk): | class HgArchiveLoaderFromDisk(HgLoaderFromDisk): | ||||
"""Mercurial loader for repository wrapped within tarballs.""" | """Mercurial loader for repository wrapped within tarballs.""" | ||||
def __init__( | def __init__( | ||||
self, url: str, visit_date: Optional[datetime] = None, archive_path: str = None | self, url: str, visit_date: Optional[datetime] = None, archive_path: str = None | ||||
): | ): | ||||
▲ Show 20 Lines • Show All 51 Lines • Show Last 20 Lines |
We need to iterate from the highest level directory, not the lower lever one, right ?
If I am not mistaken, this code seems wrong. Why is this not caught by the tests ?