Changeset View
Standalone View
swh/loader/mercurial/from_disk.py
- This file was added.
import os | |||||
import textwrap | |||||
from abc import ABC, abstractmethod | |||||
from datetime import datetime | |||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | |||||
import mercurial.ui # type: ignore | |||||
from mercurial import hg | |||||
from mercurial import tags as tagsmod | |||||
from swh.loader.core.loader import DVCSLoader | |||||
marmoute: We should add a comment about the fact the internal Mercurial API is not guaranteed to be… | |||||
Done Inline Actions: This comment still applies (but the line moved a bit further down the file). marmoute: This comment still applies (but the line moved a bit further down the file) | |||||
from swh.model.hashutil import MultiHash, hash_to_bytes | |||||
from swh.model.model import ( | |||||
BaseContent, | |||||
Content, | |||||
Directory, | |||||
ObjectType, | |||||
Origin, | |||||
Person, | |||||
Release, | |||||
Revision, | |||||
RevisionType, | |||||
Snapshot, | |||||
TargetType, | |||||
Timestamp, | |||||
TimestampWithTimezone, | |||||
) | |||||
# Git-like mode bits used when mapping mercurial entries to SWH directory
# entries (see swh.model's directory model).
DIR_PERM = 0o040000  # directory entry

# Mapping from a mercurial file flag to the corresponding git-like file mode.
FLAG_PERMS: Dict[bytes, int] = {
    b"l": 0o120000,  # symlink
    b"x": 0o100755,  # executable file
    b"": 0o100644,  # regular file (no flag)
}
class HgLoaderFromDisk(DVCSLoader):
    """Load a mercurial repository from a cloned mercurial directory.

    NOTE: this loader reads the repository through mercurial's internal
    API, which is not a public interface and is not guaranteed to be
    stable across mercurial releases.
    """

    CONFIG_BASE_FILENAME = "loader/mercurial"
    visit_type = "hg"

    def __init__(
        self,
        url: str,
        directory: str,  # Path to a cloned mercurial repository
        logging_class="swh.loader.mercurial.Loader",
        visit_date: Optional[datetime] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
        """Initialize the loader.

        Args:
            url: origin url of the repository being loaded.
            directory: filesystem path to a local clone of the mercurial
                repository to load.
            logging_class: name of the logger used by the loader.
            visit_date: date of the visit, if already known.
            config: loader configuration; defaults to an empty dict.
        """
        super().__init__(logging_class=logging_class, config=config or {})
        self.origin_url = url
        self.visit_date = visit_date
        self.directory = directory

        # State populated by the successive loading steps; declared here so
        # the complete attribute set is visible to readers and type checkers
        # instead of being scattered across the methods below.
        self.repo = None  # mercurial repository handle, set by prepare()
        # {rev: {filepath: multihash digest dict}}, set by prepare()
        self.hashes_cache: Dict[int, Dict[bytes, Dict[str, bytes]]] = {}
        # {tag name: {"rev": ..., "target": ...}}, set by prepare()
        self.tags_rev: Dict[str, Dict] = {}
        # {rev: directory sha1_git}, set by get_directories()
        self.rev_dir_hashes: Dict[int, bytes] = {}
        # {hg node: revision sha1_git}, set by get_revisions()
        self._node_hashes: Dict[bytes, bytes] = {}
        # {tag name: hg node of the tag target}, set by get_releases()
        self.tags_target: Dict[bytes, bytes] = {}

    def prepare_origin_visit(self, *args, **kwargs) -> None:
        """First step executed by the loader to prepare origin and visit
        references. Set/update self.origin, and
        optionally self.origin_url, self.visit_date.
        """
        self.origin = Origin(url=self.origin_url)

    def prepare(self, *args, **kwargs):
        """Second step executed by the loader to prepare some state needed by
        the loader.
        """
        ui = mercurial.ui.ui.load()
        # unfiltered() lifts any repository-level filtering (e.g. hidden
        # changesets) so that every available changeset gets loaded.
        self.repo = hg.repository(ui, self.directory.encode()).unfiltered()
        self.hashes_cache = build_hashes_cache(self.repo)
        self.tags_rev = get_tags_rev(self.repo)

    def save_data(self) -> None:
        """Save the data associated to the current load"""
        # TODO may be implemented with incremental loading
        return

    def fetch_data(self) -> bool:
        """Fetch the data from the data source."""
        return False  # No data to fetch since we use an already cloned repo

    def has_contents(self) -> bool:
        """Checks whether we need to load contents"""
        # TODO may be implemented with incremental loading
        return True

    def get_contents(self) -> Iterable[BaseContent]:
        """Get the contents that need to be loaded"""
        # Iterate every file of every revision, using the hashes computed
        # up front in prepare() (see build_hashes_cache).
        for rev in self.hashes_cache:
            ctx = self.repo[rev]
            for filename in self.hashes_cache[rev]:
                fctx = ctx[filename]
                data = fctx.data()
                hashes = self.hashes_cache[rev][filename]
                yield Content(
                    data=data, status="visible", length=len(data), **hashes,
                )

    def has_directories(self) -> bool:
        """Checks whether we need to load directories"""
        # TODO may be implemented with incremental loading
        return True

    def get_directories(self) -> Iterable[Directory]:
        """Get the directories that need to be loaded"""
        # Rebuild, for each revision, the directory tree of its manifest and
        # remember the root directory hash for get_revisions().
        self.rev_dir_hashes = {}
        for rev in self.hashes_cache:
            tree = TreeDirectory(None)
            ctx = self.repo[rev]
            for filepath in self.hashes_cache[rev]:
                fctx = ctx[filepath]
                tree.add_file(
                    path=filepath,
                    perms=FLAG_PERMS[fctx.flags()],
                    sha1_git=self.hashes_cache[rev][filepath]["sha1_git"],
                )
            yield from tree.directories()
            self.rev_dir_hashes[rev] = tree.sha1_git()

    def has_revisions(self) -> bool:
        """Checks whether we need to load revisions"""
        # TODO may be implemented with incremental loading
        return True

    def get_revisions(self) -> Iterable[Revision]:
        """Get the revisions that need to be loaded"""
        # Keep track of calculated revisions sha1_git
        # for release and snapshot generation
        self._node_hashes = {}
        for rev in self.repo:
            ctx = self.repo[rev]
            parents = tuple(
                [
                    self._node_hashes[p.node()]  # revisions are in increasing order so
                    # this won't fail
                    for p in ctx.parents()
                    if p.node() != mercurial.node.nullid
                ]
            )
            author = author_dict_from_str(ctx.user())
            rev_date = get_ctx_date(ctx).to_dict()
            revision = {
                "author": author,
                "date": rev_date,
                "committer": author,
                "committer_date": rev_date,
                "type": RevisionType.MERCURIAL.value,
                "directory": self.rev_dir_hashes[rev],
                "message": ctx.description(),
                "metadata": {
                    f"hg-extra-{key}": value for key, value in ctx.extra().items()
                },
                "synthetic": False,
                "parents": parents,
            }
            revision["id"] = hash_to_bytes(Revision.compute_hash(revision))
            self._node_hashes[ctx.node()] = revision["id"]
            yield Revision.from_dict(revision)

    def has_releases(self) -> bool:
        """Checks whether we need to load releases"""
        # TODO may be implemented with incremental loading
        return True

    def get_releases(self) -> Iterable[Release]:
        """Get the releases that need to be loaded"""
        # Keep track of original tags target for snapshot generation
        self.tags_target = {}
        for (name, nodeid) in self.repo.tags().items():
            target_ctx = self.repo[nodeid]
            self.tags_target[name] = target_ctx.node()
            # "tip" is not user-defined: it is not loaded as a release.
            if name == b"tip":
                continue
            if nodeid == self.tags_rev[name]["target"]:
                tag_ctx = self.repo[self.tags_rev[name]["rev"]]
                release = {
                    "author": author_dict_from_str(tag_ctx.user()),
                    "date": get_ctx_date(tag_ctx).to_dict(),
                    "name": name,
                    "target": self._node_hashes[target_ctx.node()],
                    "target_type": ObjectType.REVISION.value,
                    "message": None,
                    "metadata": None,
                    "synthetic": False,
                }
                release["id"] = hash_to_bytes(Release.compute_hash(release))
                yield Release.from_dict(release)
            else:
                # TODO: should not happen -> warning or exception.
                pass

    def get_snapshot(self) -> Snapshot:
        """Get the snapshot that needs to be loaded"""
        branches: Dict[bytes, Any] = {}
        for name, heads in self.repo.branchmap().items():
            # When a branch has multiple heads, the 1st is stored as "branch/{name}"
            # while the next are stored has "wild-branch/{branch}/{target}
            used_names = []  # To know if the first head of branch has used the name
            for head in heads:
                ctx = self.repo[head]
                target = self._node_hashes[ctx.node()]
                if name in used_names:
                    ref = b"wild-branch/" + name + b"/" + target.hex().encode()
                else:
                    ref = b"branch/" + name
                used_names.append(name)
                branches[ref] = {
                    "target": target,
                    "target_type": TargetType.REVISION.value,
                }
        for name, target in self.tags_target.items():
            # The "tip" tag is a special tag that is more like a head.
            # All heads should have been handled before.
            # There is no need to process "tip".
            if name != b"tip":
                ref = b"tag/" + name
                branches[ref] = {
                    "target": self._node_hashes[target],
                    "target_type": TargetType.REVISION.value,
                }
        snapshot = {"branches": branches}
        snapshot["id"] = hash_to_bytes(Snapshot.compute_hash({"branches": branches}))
        return Snapshot.from_dict(snapshot)

    def load_status(self):
        """Detailed loading status.

        Defaults to logging an eventful load.

        Returns: a dictionary that is eventually passed back as the task's
            result to the scheduler, allowing tuning of the task recurrence
            mechanism.
        """
        return {
            "status": "eventful",
        }
Done Inline Actions
marmoute: - "incremental loading" is probably an overloaded ambiguous terms, lets stop using it. | |||||
def get_ctx_date(ctx) -> TimestampWithTimezone:
    """Return the timestamp of changeset *ctx* as a TimestampWithTimezone."""
    timestamp, offset = ctx.date()
    # ctx.date() yields (unix timestamp, offset); the `// 60` converts the
    # offset to minutes and the sign is flipped for the SWH convention.
    # TODO(review): the offset handling still needs investigation — the
    # timestamp "seems to always be negative", and negative_utc is set for
    # every offset == 0 date; confirm both against the historical loader.
    return TimestampWithTimezone(
        timestamp=Timestamp(seconds=int(timestamp), microseconds=0),
        offset=-(offset // 60),
        negative_utc=(offset == 0),
    )
def build_hashes_cache(repo):
    """Precompute the content hashes of every file of every revision.

    Returns:
        a mapping ``{rev: {filepath: hashes}}`` where ``hashes`` is the
        MultiHash digest dict of the file content at that revision.

    NOTE(review): this walks the whole history and hashes every file of
    every revision up front, which will not scale to large repositories.
    TODO: use lru cache at when incremental loading is implemented.
    """
    cache = {}
    for rev in repo:
        ctx = repo[rev]
        per_file = {}
        for filepath in ctx.manifest():
            data = ctx[filepath].data()
            per_file[filepath] = MultiHash.from_data(data).digest()
        cache[rev] = per_file
    return cache
def path_split_all(path):
    """Split a (relative, bytes) path into the list of all its components."""
    components = []
    remaining = path
    while True:
        remaining, last = os.path.split(remaining)
        if last == b"":
            break
        # Prepend, since os.path.split peels components off the right end.
        components.insert(0, last)
    return components
Done Inline ActionsIt would be nice to document what a SWH's Release maps to in terms of Mercurial concept. marmoute: It would be nice to document what a SWH's Release maps to in terms of Mercurial concept. | |||||
def get_tags_rev(repo) -> Dict[str, Dict]:
    """Map each tag to the rev that introduced it and the node it targets."""
    tags: Dict[str, Dict] = {}
    # Walk the history of the .hgtags file, diffing each revision of it
    # against its parents to recover tag additions, moves and removals.
    hgtags_log = repo.file(b".hgtags")
    for trev in hgtags_log:
        # TODO: handle edge cases for linkrev
        rev = hgtags_log.linkrev(trev)
        old_fnodes = []
        for parent in hgtags_log.parentrevs(trev):
            if parent != mercurial.node.nullrev:
                old_fnodes.append(hgtags_log.node(parent))
        new_fnodes = [hgtags_log.node(trev)]
        changes = tagsmod.difftags(repo.ui, repo, old_fnodes, new_fnodes)
        for tag, old, new in changes:
            if new is None:
                # The tag has been removed.
                del tags[tag]
            else:
                # The tag has been added or moved.
                tags[tag] = {
                    "rev": rev,  # rev which added the tag
                    "target": new,  # targeted node
                }
    return tags
class TreeElement(ABC):
    """Common interface that every element of the directory tree implements."""

    @abstractmethod
    def as_entry_dict(self) -> Dict[str, Any]:
        """Return the element as a dict suitable for `Directory.from_dict`."""

    def directories(self) -> Iterator[Directory]:
        """Yield every directory contained in the element (none by default)."""
        yield from ()
class TreeDirectory(TreeElement):
    """Represent a directory structure.

    It is used to recursively compute the sha1_git of all the elements of
    the tree.

    When incremental loading is implemented, it will be used to recalculate
    only the changed parts.
    """

    def __init__(self, name: Optional[bytes]) -> None:
        # ``name`` is None for the root directory only.
        self._name = name
        self._files: Dict[bytes, TreeFile] = {}
        self._dirs: Dict[bytes, TreeDirectory] = {}
        # Cached hash of this directory; must be reset whenever the
        # directory content changes, otherwise a stale hash is returned.
        self._sha1_git: Optional[bytes] = None

    def add_file(self, path: bytes, perms: int, sha1_git: bytes) -> None:
        """Add a file to the tree by its full path.

        Intermediate directories are created on the fly.
        """
        path, filename = os.path.split(path)
        parts = path_split_all(path)
        current_dir = self
        for part in parts:
            current_dir = current_dir._add_tree(part)
        current_dir._add_file(filename, perms, sha1_git)

    def _add_tree(self, name: bytes) -> "TreeDirectory":
        """Return the subdirectory ``name``, creating it if needed."""
        # This method is only reached from ``add_file`` while descending
        # toward the file being added, so this directory's content is about
        # to change in every case: drop the cached hash.
        self._sha1_git = None
        if name not in self._dirs:
            self._dirs[name] = TreeDirectory(name)
        return self._dirs[name]

    def _add_file(self, name: bytes, perms: int, sha1_git: bytes) -> None:
        """Add a file to this directory by its name.

        Raises:
            Exception: if ``name`` already exists as a file or a directory.
        """
        # errors="replace" keeps the error message printable even when the
        # raw filename is not valid UTF-8 (a UnicodeDecodeError here would
        # mask the actual problem).
        if name in self._files:
            raise Exception(f"name {name.decode(errors='replace')} already exists")
        if name in self._dirs:
            raise Exception(
                f"name {name.decode(errors='replace')} already is an existing directory"
            )
        # Content changed: invalidate the cached hash.
        self._sha1_git = None
        self._files[name] = TreeFile(name, perms, sha1_git)

    def __str__(self):
        """Display the tree (for debug purpose only)."""
        name = self._name.decode(errors="replace") if self._name else "/"
        files = textwrap.indent("\n".join(map(str, self._files.values())), prefix=" ")
        dirs = textwrap.indent("\n".join(map(str, self._dirs.values())), prefix=" ")
        return "\n".join([f"{name} ({self.sha1_git().hex()})", files, dirs])

    def sha1_git(self) -> bytes:
        """Compute the hash of the tree.

        The result is cached until the directory content changes.
        """
        if self._sha1_git is None:
            self._sha1_git = hash_to_bytes(
                Directory.compute_hash({"entries": self._entries()})
            )
        return self._sha1_git

    def _entries(self) -> List[Dict[str, Any]]:
        """List entries of the tree (file entries first, then directories)."""
        file_entries = [f.as_entry_dict() for f in self._files.values()]
        dir_entries = [d.as_entry_dict() for d in self._dirs.values()]
        return file_entries + dir_entries

    def as_entry_dict(self) -> Dict[str, Any]:
        """Return the tree as a dict suitable for `Directory.from_dict`."""
        return {
            "type": "dir",
            "perms": DIR_PERM,
            "name": self._name,
            "target": self.sha1_git(),
        }

    def directories(self) -> Iterator[Directory]:
        """Yield all directories contained in the tree (children first)."""
        for item in self._dirs.values():
            yield from item.directories()
        yield Directory.from_dict(
            {"id": self.sha1_git(), "entries": self._entries()}
        )
class TreeFile(TreeElement):
    """Leaf node of the tree: a single file entry."""

    def __init__(self, name: bytes, perms: int, sha1_git: bytes) -> None:
        self._name = name
        self._perms = perms
        self._sha1_git = sha1_git

    def __str__(self):
        """Display the file (for debug purpose only)."""
        return "{} ({})".format(self._name.decode(), self._sha1_git.hex())

    def as_entry_dict(self) -> Dict[str, Any]:
        """Return the file as a dict suitable for `Directory.from_dict`."""
        entry: Dict[str, Any] = {
            "type": "file",
            "perms": self._perms,
            "name": self._name,
            "target": self._sha1_git,
        }
        return entry
def author_dict_from_str(author: bytes) -> Dict[str, bytes]:
    """Parse a raw Mercurial author field into a Person-style dict.

    Mercurial author fields are free-form, so `Person.from_fullname` is a
    best-effort split into name and email; either part may be missing.

    Returns:
        A dict with ``fullname``, ``name`` and ``email`` keys. git requires a
        space between name and email, otherwise the hash differs; the
        fullname is used when present in the hash function.
    """
    result = Person.from_fullname(author).to_dict()
    name = result.get("name")
    email = result.get("email")
    if name is not None and email is not None:
        # Normalize to the git-style "name <email>" form.
        result["fullname"] = name + b" <" + email + b">"
    # Otherwise keep the fullname computed by Person.from_fullname: the
    # original concatenation would raise TypeError on a missing part.
    return result
if __name__ == "__main__":
    import logging
    from datetime import timezone

    import click

    logging.basicConfig(
        level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s"
    )

    @click.command()
    @click.option("--origin-url", help="origin url")
    @click.option("--hg-directory", help="Path to mercurial repository to load")
    @click.option("--visit-date", default=None, help="Visit date")
    def main(origin_url, hg_directory, visit_date):
        """Load a local mercurial clone into the archive (debug entry point)."""
        if not visit_date:
            # The file imports the ``datetime`` *class*, so the previous
            # ``datetime.datetime.now(...)`` raised AttributeError.
            visit_date = datetime.now(tz=timezone.utc)
        # HgLoaderFromDisk.__init__ requires url and directory; the previous
        # code passed them to load() instead and built the loader with no
        # arguments, which raised TypeError.
        # NOTE(review): assumes __init__ accepts visit_date — the visible
        # signature is truncated; confirm against the constructor.
        loader = HgLoaderFromDisk(
            url=origin_url, directory=hg_directory, visit_date=visit_date
        )
        return loader.load()

    main()
Done Inline Actionswhat? a function located after the if __name__ == "__main__"? douardda: what? a function located //after// the `if __name__ == "__main__"`? | |||||
Done Inline Actions(please move this before the if __name__ == "__main__" block.) The semantic of this function needs to be document, What is the actual information we fetch, and why do we needs it. marmoute: (please move this before the `if __name__ == "__main__"` block.)
The semantic of this function… |
We should add a comment about the fact the internal Mercurial API is not guaranteed to be stable. In practice I do not expect the API we use to change much, but we better warn the future people looking into this.