Changeset View
Standalone View
swh/loader/mercurial/from_disk.py
- This file was added.
import os | |||||
import textwrap | |||||
from abc import ABC, abstractmethod | |||||
from datetime import datetime | |||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | |||||
import mercurial.ui # type: ignore | |||||
from mercurial import hg | |||||
from mercurial import tags as tagsmod | |||||
from swh.loader.core.loader import DVCSLoader | |||||
marmoute: We should add a comment about the fact the internal Mercurial API is not guaranteed to be… | |||||
Done Inline Actions: This comment still applies (but the line moved a bit further down the file). marmoute: This comment still applies (but the line moved a bit further down the file) | |||||
from swh.model.hashutil import MultiHash, hash_to_bytes | |||||
from swh.model.model import ( | |||||
BaseContent, | |||||
Content, | |||||
Directory, | |||||
ObjectType, | |||||
Origin, | |||||
Person, | |||||
Release, | |||||
Revision, | |||||
RevisionType, | |||||
Snapshot, | |||||
TargetType, | |||||
Timestamp, | |||||
TimestampWithTimezone, | |||||
) | |||||
# Git-like mode bits used when mapping mercurial entries to SWH directory
# entries (see swh.model's directory model).
DIR_PERM = 0o040000  # directory entry

# Mapping from a mercurial file flag to the corresponding git-like file mode.
FLAG_PERMS: Dict[bytes, int] = {
    b"l": 0o120000,  # symlink
    b"x": 0o100755,  # executable file
    b"": 0o100644,  # regular file (no flag)
}
class HgLoaderFromDisk(DVCSLoader):
    """Load a mercurial repository from a cloned mercurial directory.

    NOTE: this loader reads the repository through mercurial's internal
    API, which is not a public interface and is not guaranteed to be
    stable across mercurial releases.
    """

    CONFIG_BASE_FILENAME = "loader/mercurial"
    visit_type = "hg"

    def __init__(
        self,
        url: str,
        directory: str,  # Path to a cloned mercurial repository
        logging_class="swh.loader.mercurial.Loader",
        visit_date: Optional[datetime] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
        """Initialize the loader.

        Args:
            url: origin url of the repository being loaded.
            directory: filesystem path to a local clone of the mercurial
                repository to load.
            logging_class: name of the logger used by the loader.
            visit_date: date of the visit, if already known.
            config: loader configuration; defaults to an empty dict.
        """
        super().__init__(logging_class=logging_class, config=config or {})
        self.origin_url = url
        self.visit_date = visit_date
        self.directory = directory

        # State populated by the successive loading steps; declared here so
        # the complete attribute set is visible to readers and type checkers
        # instead of being scattered across the methods below.
        self.repo = None  # mercurial repository handle, set by prepare()
        # {rev: {filepath: multihash digest dict}}, set by prepare()
        self.hashes_cache: Dict[int, Dict[bytes, Dict[str, bytes]]] = {}
        # {tag name: {"rev": ..., "target": ...}}, set by prepare()
        self.tags_rev: Dict[str, Dict] = {}
        # {rev: directory sha1_git}, set by get_directories()
        self.rev_dir_hashes: Dict[int, bytes] = {}
        # {hg node: revision sha1_git}, set by get_revisions()
        self._node_hashes: Dict[bytes, bytes] = {}
        # {tag name: hg node of the tag target}, set by get_releases()
        self.tags_target: Dict[bytes, bytes] = {}

    def prepare_origin_visit(self, *args, **kwargs) -> None:
        """First step executed by the loader to prepare origin and visit
        references. Set/update self.origin, and
        optionally self.origin_url, self.visit_date.
        """
        self.origin = Origin(url=self.origin_url)

    def prepare(self, *args, **kwargs):
        """Second step executed by the loader to prepare some state needed by
        the loader.
        """
        ui = mercurial.ui.ui.load()
        # unfiltered() lifts any repository-level filtering (e.g. hidden
        # changesets) so that every available changeset gets loaded.
        self.repo = hg.repository(ui, self.directory.encode()).unfiltered()
        self.hashes_cache = build_hashes_cache(self.repo)
        self.tags_rev = get_tags_rev(self.repo)

    def save_data(self) -> None:
        """Save the data associated to the current load"""
        # TODO may be implemented with incremental loading
        return

    def fetch_data(self) -> bool:
        """Fetch the data from the data source."""
        return False  # No data to fetch since we use an already cloned repo

    def has_contents(self) -> bool:
        """Checks whether we need to load contents"""
        # TODO may be implemented with incremental loading
        return True

    def get_contents(self) -> Iterable[BaseContent]:
        """Get the contents that need to be loaded"""
        # Iterate every file of every revision, using the hashes computed
        # up front in prepare() (see build_hashes_cache).
        for rev in self.hashes_cache:
            ctx = self.repo[rev]
            for filename in self.hashes_cache[rev]:
                fctx = ctx[filename]
                data = fctx.data()
                hashes = self.hashes_cache[rev][filename]
                yield Content(
                    data=data, status="visible", length=len(data), **hashes,
                )

    def has_directories(self) -> bool:
        """Checks whether we need to load directories"""
        # TODO may be implemented with incremental loading
        return True

    def get_directories(self) -> Iterable[Directory]:
        """Get the directories that need to be loaded"""
        # Rebuild, for each revision, the directory tree of its manifest and
        # remember the root directory hash for get_revisions().
        self.rev_dir_hashes = {}
        for rev in self.hashes_cache:
            tree = TreeDirectory(None)
            ctx = self.repo[rev]
            for filepath in self.hashes_cache[rev]:
                fctx = ctx[filepath]
                tree.add_file(
                    path=filepath,
                    perms=FLAG_PERMS[fctx.flags()],
                    sha1_git=self.hashes_cache[rev][filepath]["sha1_git"],
                )
            yield from tree.directories()
            self.rev_dir_hashes[rev] = tree.sha1_git()

    def has_revisions(self) -> bool:
        """Checks whether we need to load revisions"""
        # TODO may be implemented with incremental loading
        return True

    def get_revisions(self) -> Iterable[Revision]:
        """Get the revisions that need to be loaded"""
        # Keep track of calculated revisions sha1_git
        # for release and snapshot generation
        self._node_hashes = {}
        for rev in self.repo:
            ctx = self.repo[rev]
            parents = tuple(
                [
                    self._node_hashes[p.node()]  # revisions are in increasing order so
                    # this won't fail
                    for p in ctx.parents()
                    if p.node() != mercurial.node.nullid
                ]
            )
            author = author_dict_from_str(ctx.user())
            rev_date = get_ctx_date(ctx).to_dict()
            revision = {
                "author": author,
                "date": rev_date,
                "committer": author,
                "committer_date": rev_date,
                "type": RevisionType.MERCURIAL.value,
                "directory": self.rev_dir_hashes[rev],
                "message": ctx.description(),
                "metadata": {
                    f"hg-extra-{key}": value for key, value in ctx.extra().items()
                },
                "synthetic": False,
                "parents": parents,
            }
            revision["id"] = hash_to_bytes(Revision.compute_hash(revision))
            self._node_hashes[ctx.node()] = revision["id"]
            yield Revision.from_dict(revision)

    def has_releases(self) -> bool:
        """Checks whether we need to load releases"""
        # TODO may be implemented with incremental loading
        return True

    def get_releases(self) -> Iterable[Release]:
        """Get the releases that need to be loaded"""
        # Keep track of original tags target for snapshot generation
        self.tags_target = {}
        for (name, nodeid) in self.repo.tags().items():
            target_ctx = self.repo[nodeid]
            self.tags_target[name] = target_ctx.node()
            # "tip" is not user-defined: it is not loaded as a release.
            if name == b"tip":
                continue
            if nodeid == self.tags_rev[name]["target"]:
                tag_ctx = self.repo[self.tags_rev[name]["rev"]]
                release = {
                    "author": author_dict_from_str(tag_ctx.user()),
                    "date": get_ctx_date(tag_ctx).to_dict(),
                    "name": name,
                    "target": self._node_hashes[target_ctx.node()],
                    "target_type": ObjectType.REVISION.value,
                    "message": None,
                    "metadata": None,
                    "synthetic": False,
                }
                release["id"] = hash_to_bytes(Release.compute_hash(release))
                yield Release.from_dict(release)
            else:
                # TODO: should not happen -> warning or exception.
                pass

    def get_snapshot(self) -> Snapshot:
        """Get the snapshot that needs to be loaded"""
        branches: Dict[bytes, Any] = {}
        for name, heads in self.repo.branchmap().items():
            # When a branch has multiple heads, the 1st is stored as "branch/{name}"
            # while the next are stored has "wild-branch/{branch}/{target}
            used_names = []  # To know if the first head of branch has used the name
            for head in heads:
                ctx = self.repo[head]
                target = self._node_hashes[ctx.node()]
                if name in used_names:
                    ref = b"wild-branch/" + name + b"/" + target.hex().encode()
                else:
                    ref = b"branch/" + name
                used_names.append(name)
                branches[ref] = {
                    "target": target,
                    "target_type": TargetType.REVISION.value,
                }
        for name, target in self.tags_target.items():
            # The "tip" tag is a special tag that is more like a head.
            # All heads should have been handled before.
            # There is no need to process "tip".
            if name != b"tip":
                ref = b"tag/" + name
                branches[ref] = {
                    "target": self._node_hashes[target],
                    "target_type": TargetType.REVISION.value,
                }
        snapshot = {"branches": branches}
        snapshot["id"] = hash_to_bytes(Snapshot.compute_hash({"branches": branches}))
        return Snapshot.from_dict(snapshot)

    def load_status(self):
        """Detailed loading status.

        Defaults to logging an eventful load.

        Returns: a dictionary that is eventually passed back as the task's
            result to the scheduler, allowing tuning of the task recurrence
            mechanism.
        """
        return {
            "status": "eventful",
        }
Done Inline Actions
marmoute: - "incremental loading" is probably an overloaded ambiguous terms, lets stop using it. | |||||
def get_ctx_date(ctx) -> TimestampWithTimezone:
    """Return the timestamp of changeset *ctx* as a TimestampWithTimezone."""
    timestamp, offset = ctx.date()
    # ctx.date() yields (unix timestamp, offset); the `// 60` converts the
    # offset to minutes and the sign is flipped for the SWH convention.
    # TODO(review): the offset handling still needs investigation — the
    # timestamp "seems to always be negative", and negative_utc is set for
    # every offset == 0 date; confirm both against the historical loader.
    return TimestampWithTimezone(
        timestamp=Timestamp(seconds=int(timestamp), microseconds=0),
        offset=-(offset // 60),
        negative_utc=(offset == 0),
    )
def build_hashes_cache(repo):
    """Precompute the content hashes of every file of every revision.

    Returns:
        a mapping ``{rev: {filepath: hashes}}`` where ``hashes`` is the
        MultiHash digest dict of the file content at that revision.

    NOTE(review): this walks the whole history and hashes every file of
    every revision up front, which will not scale to large repositories.
    TODO: use lru cache at when incremental loading is implemented.
    """
    cache = {}
    for rev in repo:
        ctx = repo[rev]
        per_file = {}
        for filepath in ctx.manifest():
            data = ctx[filepath].data()
            per_file[filepath] = MultiHash.from_data(data).digest()
        cache[rev] = per_file
    return cache
def path_split_all(path):
    """Split a (relative, bytes) path into the list of all its components."""
    components = []
    remaining = path
    while True:
        remaining, last = os.path.split(remaining)
        if last == b"":
            break
        # Prepend, since os.path.split peels components off the right end.
        components.insert(0, last)
    return components
Done Inline ActionsIt would be nice to document what a SWH's Release maps to in terms of Mercurial concept. marmoute: It would be nice to document what a SWH's Release maps to in terms of Mercurial concept. | |||||
def get_tags_rev(repo) -> Dict[str, Dict]:
    """Map each tag to the rev that introduced it and the node it targets."""
    tags: Dict[str, Dict] = {}
    # Walk the history of the .hgtags file, diffing each revision of it
    # against its parents to recover tag additions, moves and removals.
    hgtags_log = repo.file(b".hgtags")
    for trev in hgtags_log:
        # TODO: handle edge cases for linkrev
        rev = hgtags_log.linkrev(trev)
        old_fnodes = []
        for parent in hgtags_log.parentrevs(trev):
            if parent != mercurial.node.nullrev:
                old_fnodes.append(hgtags_log.node(parent))
        new_fnodes = [hgtags_log.node(trev)]
        changes = tagsmod.difftags(repo.ui, repo, old_fnodes, new_fnodes)
        for tag, old, new in changes:
            if new is None:
                # The tag has been removed.
                del tags[tag]
            else:
                # The tag has been added or moved.
                tags[tag] = {
                    "rev": rev,  # rev which added the tag
                    "target": new,  # targeted node
                }
    return tags
class TreeElement(ABC):
    """Common interface that every element of the directory tree implements."""

    @abstractmethod
    def as_entry_dict(self) -> Dict[str, Any]:
        """Return the element as a dict suitable for `Directory.from_dict`."""

    def directories(self) -> Iterator[Directory]:
        """Yield every directory contained in the element (none by default)."""
        yield from ()
class TreeDirectory(TreeElement):
    """Represent a directory structure.

    It is used to recursively compute the sha1_git of all the elements of
    the tree.

    When incremental loading is implemented, it will be used to recalculate
    only the changed parts.
    """

    def __init__(self, name: Optional[bytes]) -> None:
        # ``name`` is None for the root directory only.
        self._name = name
        self._files: Dict[bytes, TreeFile] = {}
        self._dirs: Dict[bytes, TreeDirectory] = {}
        # Cached hash of this directory; must be reset whenever the
        # directory content changes, otherwise a stale hash is returned.
        self._sha1_git: Optional[bytes] = None

    def add_file(self, path: bytes, perms: int, sha1_git: bytes) -> None:
        """Add a file to the tree by its full path.

        Intermediate directories are created on the fly.
        """
        path, filename = os.path.split(path)
        parts = path_split_all(path)
        current_dir = self
        for part in parts:
            current_dir = current_dir._add_tree(part)
        current_dir._add_file(filename, perms, sha1_git)

    def _add_tree(self, name: bytes) -> "TreeDirectory":
        """Return the subdirectory ``name``, creating it if needed."""
        # This method is only reached from ``add_file`` while descending
        # toward the file being added, so this directory's content is about
        # to change in every case: drop the cached hash.
        self._sha1_git = None
        if name not in self._dirs:
            self._dirs[name] = TreeDirectory(name)
        return self._dirs[name]

    def _add_file(self, name: bytes, perms: int, sha1_git: bytes) -> None:
        """Add a file to this directory by its name.

        Raises:
            Exception: if ``name`` already exists as a file or a directory.
        """
        # errors="replace" keeps the error message printable even when the
        # raw filename is not valid UTF-8 (a UnicodeDecodeError here would
        # mask the actual problem).
        if name in self._files:
            raise Exception(f"name {name.decode(errors='replace')} already exists")
        if name in self._dirs:
            raise Exception(
                f"name {name.decode(errors='replace')} already is an existing directory"
            )
        # Content changed: invalidate the cached hash.
        self._sha1_git = None
        self._files[name] = TreeFile(name, perms, sha1_git)

    def __str__(self):
        """Display the tree (for debug purpose only)."""
        name = self._name.decode(errors="replace") if self._name else "/"
        files = textwrap.indent("\n".join(map(str, self._files.values())), prefix=" ")
        dirs = textwrap.indent("\n".join(map(str, self._dirs.values())), prefix=" ")
        return "\n".join([f"{name} ({self.sha1_git().hex()})", files, dirs])

    def sha1_git(self) -> bytes:
        """Compute the hash of the tree.

        The result is cached until the directory content changes.
        """
        if self._sha1_git is None:
            self._sha1_git = hash_to_bytes(
                Directory.compute_hash({"entries": self._entries()})
            )
        return self._sha1_git

    def _entries(self) -> List[Dict[str, Any]]:
        """List entries of the tree (file entries first, then directories)."""
        file_entries = [f.as_entry_dict() for f in self._files.values()]
        dir_entries = [d.as_entry_dict() for d in self._dirs.values()]
        return file_entries + dir_entries

    def as_entry_dict(self) -> Dict[str, Any]:
        """Return the tree as a dict suitable for `Directory.from_dict`."""
        return {
            "type": "dir",
            "perms": DIR_PERM,
            "name": self._name,
            "target": self.sha1_git(),
        }

    def directories(self) -> Iterator[Directory]:
        """Yield all directories contained in the tree (children first)."""
        for item in self._dirs.values():
            yield from item.directories()
        yield Directory.from_dict(
            {"id": self.sha1_git(), "entries": self._entries()}
        )
class TreeFile(TreeElement):
    """Leaf node of the tree: a single file entry."""

    def __init__(self, name: bytes, perms: int, sha1_git: bytes) -> None:
        self._name = name
        self._perms = perms
        self._sha1_git = sha1_git

    def __str__(self):
        """Display the file (for debug purpose only)."""
        return "{} ({})".format(self._name.decode(), self._sha1_git.hex())

    def as_entry_dict(self) -> Dict[str, Any]:
        """Return the file as a dict suitable for `Directory.from_dict`."""
        entry: Dict[str, Any] = {
            "type": "file",
            "perms": self._perms,
            "name": self._name,
            "target": self._sha1_git,
        }
        return entry
def author_dict_from_str(author: bytes) -> Dict[str, bytes]:
    """Parse a raw Mercurial author field into a Person-style dict.

    Mercurial author fields are free-form, so `Person.from_fullname` is a
    best-effort split into name and email; either part may be missing.

    Returns:
        A dict with ``fullname``, ``name`` and ``email`` keys. git requires a
        space between name and email, otherwise the hash differs; the
        fullname is used when present in the hash function.
    """
    result = Person.from_fullname(author).to_dict()
    name = result.get("name")
    email = result.get("email")
    if name is not None and email is not None:
        # Normalize to the git-style "name <email>" form.
        result["fullname"] = name + b" <" + email + b">"
    # Otherwise keep the fullname computed by Person.from_fullname: the
    # original concatenation would raise TypeError on a missing part.
    return result
if __name__ == "__main__":
    import logging
    from datetime import timezone

    import click

    logging.basicConfig(
        level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s"
    )

    @click.command()
    @click.option("--origin-url", help="origin url")
    @click.option("--hg-directory", help="Path to mercurial repository to load")
    @click.option("--visit-date", default=None, help="Visit date")
    def main(origin_url, hg_directory, visit_date):
        """Load a local mercurial clone into the archive (debug entry point)."""
        if not visit_date:
            # The file imports the ``datetime`` *class*, so the previous
            # ``datetime.datetime.now(...)`` raised AttributeError.
            visit_date = datetime.now(tz=timezone.utc)
        # HgLoaderFromDisk.__init__ requires url and directory; the previous
        # code passed them to load() instead and built the loader with no
        # arguments, which raised TypeError.
        # NOTE(review): assumes __init__ accepts visit_date — the visible
        # signature is truncated; confirm against the constructor.
        loader = HgLoaderFromDisk(
            url=origin_url, directory=hg_directory, visit_date=visit_date
        )
        return loader.load()

    main()
Done Inline Actionswhat? a function located after the if __name__ == "__main__"? douardda: what? a function located //after// the `if __name__ == "__main__"`? | |||||
Done Inline Actions(please move this before the if __name__ == "__main__" block.) The semantic of this function needs to be document, What is the actual information we fetch, and why do we needs it. marmoute: (please move this before the `if __name__ == "__main__"` block.)
The semantic of this function… |
We should add a comment about the fact the internal Mercurial API is not guaranteed to be stable. In practice I do not expect the API we use to change much, but we better warn the future people looking into this.