swh/loader/mercurial/from_disk.py
- This file was added.
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
from collections import deque
from datetime import datetime, timezone
from shutil import rmtree
from tempfile import mkdtemp
from typing import Any, Deque, Dict, Optional, Tuple, Union
marmoute: We should add a comment about the fact the internal Mercurial API is not guaranteed to be stable. In practice I do not expect the API we use to change much, but we better warn the future people looking into this.
marmoute: This comment still applies (but the line moved a bit further down the file).
import dateutil

from swh.core.config import merge_configs
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders
from swh.model.from_disk import Content, DentryPerms, Directory
from swh.model.hashutil import MultiHash, hash_to_bytehex
from swh.model.model import Content as ModelContent
from swh.model.model import (
    ObjectType,
    Origin,
    Person,
    Release,
    Revision,
    RevisionType,
    Sha1Git,
    Snapshot,
    SnapshotBranch,
    TargetType,
    TimestampWithTimezone,
)

from . import hgutil
from .archive_extract import tmp_extract
from .hgutil import HgNodeId
vlorentz: You can use the enum values in `swh.model.from_disk.DentryPerms`.
marmoute: This one still applies too, if I understand correctly.
FLAG_PERMS = {
    b"l": DentryPerms.symlink,
    b"x": DentryPerms.executable_content,
    b"": DentryPerms.content,
}  # type: Dict[bytes, DentryPerms]

DEFAULT_CONFIG: Dict[str, Any] = {
    "temp_directory": "/tmp",
    "clone_timeout_seconds": 7200,
    "content_cache_size": 10_000,
}

TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk"
douardda: why is this a comment rather than a properly crafted docstring?
marmoute: Having a full docstring to document the constructor arguments would be nice.
marmoute: This could use a small docstring.
def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]:
    """Convert visit date from Optional[Union[str, datetime]] to Optional[datetime].
douardda: `s/data/date/` I guess
    `HgLoaderFromDisk` accepts `str` and `datetime` as visit date
    while `BaseLoader` only deals with `datetime`.
    """
    if visit_date is None:
        return None
douardda: why do we need to use urlsplit on a directory path? If we expect the `directory` argument to (possibly) be a file:// url, then it should be told explicitly.
marmoute: Since the loader is "from disk", assuming that this is a local path seems fine. But why do we need urlsplit then? If we have to use urlsplit, can we explicitly check that the other values are the ones we expect for a local path?
    if isinstance(visit_date, datetime):
        return visit_date

    if visit_date == "now":
        return datetime.now(tz=timezone.utc)

    if isinstance(visit_date, str):
marmoute: You should document the class purpose and scope.
        return dateutil.parser.parse(visit_date)

    raise ValueError(f"invalid visit date {visit_date!r}")
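For quick reference, a hedged sketch of the helper's accepted inputs (values are illustrative, not from the diff):

```
# Illustrative calls to parse_visit_date:
parse_visit_date(None)                           # -> None
parse_visit_date("now")                          # -> datetime.now(tz=timezone.utc)
parse_visit_date("2020-11-27 12:00:00+00:00")    # -> parsed via dateutil
parse_visit_date(datetime.now(tz=timezone.utc))  # -> returned unchanged
```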
douardda: please explain why the "unfiltered" is there
douardda: that question is still not answered (however the part of the hunk it relates to is a few lines below)
class HgDirectory(Directory):
    """A directory that creates parent directories if missing."""

    def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None:
        if b"/" in path:
            head, tail = path.split(b"/", 1)
marmoute: small nit: could simply be `visit_and_status[1].snapshot`
            directory = self.get(head)
            if directory is None:
                directory = HgDirectory()
                self[head] = directory
            directory[tail] = value
        else:
            super().__setitem__(path, value)
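A small usage sketch of the class above (the Content payload is a dummy value, not from the diff): assigning a nested path creates the intermediate directories on the fly.

```
# Hedged illustration of HgDirectory's auto-created parents:
some_content = Content({"sha1_git": b"\x00" * 20, "perms": DentryPerms.content})
root = HgDirectory()
root[b"a/b/c.txt"] = some_content
# The intermediate directories b"a" and b"a/b" were created implicitly:
assert isinstance(root[b"a"], HgDirectory)
```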
douardda: ... here: explain `unfiltered()`
marmoute: The `unfiltered` is lifting any possible filtering (eg: hidden changesets) so that we import all available contents. This is good when importing revisions, but we should probably go back to the visible set when we start to read branch head information.
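A minimal sketch of what marmoute describes, assuming `repo` is a Mercurial localrepository and `store_revision` stands in for the loading step (both hypothetical here):

```
# Iterate the unfiltered view so hidden changesets are imported too,
# but read branch heads from the filtered (visible) repository.
unfiltered = repo.unfiltered()       # lifts visibility filtering
for rev in unfiltered:
    store_revision(unfiltered[rev])
visible_branches = repo.branchmap()  # visible changesets only
```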
class HgLoaderFromDisk(BaseLoader):
    """Load a mercurial repository from a local repository."""
marmoute: Maybe add a comment about visibility here.
    CONFIG_BASE_FILENAME = "loader/mercurial"

    visit_type = "hg"
marmoute: What is this function about? And why can't we have it now?
marmoute: note: the docstring seems to need an update.
    def __init__(
        self,
        url: str,
marmoute: But the local clone might still need an update, right? What would be the final state of this? Can we document the TODO here?
        directory: Optional[str] = None,
        logging_class: str = "swh.loader.mercurial.LoaderFromDisk",
        visit_date: Optional[Union[datetime, str]] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
marmoute: Same question (mostly for me to understand the situation better): what is this function about and what is the long-term goal for it?
"""Initialize the loader. | |||||
Args: | |||||
url: url of the repository. | |||||
douardda: What's this `_hashes` for?
marmoute: +1?
            directory: directory of the local repository.
douardda: Also, why don't we build the `hashes_caches` during this step?
marmoute: So if I understand correctly, this function is fetching all the "files" content, right? And we do so by iterating over all revisions, and all files in each of these revisions? Why are we getting the revisions from self.hashes_cache instead of getting them from the repository? Regarding the general approach, uploading all the files first is "good enough" for a first rough version to validate that we generate the right data for SWH, but it is not going to fly for any real-size repository. For those, we will need to move from the "all files → all manifests → all revisions" approach to a "for each rev: all files in rev → manifest of the rev → rev" one. We need to be explicit about that, either in this changeset description or in the comments (or maybe both).
            logging_class: class of the loader logger.
            visit_date: visit date of the repository
            config: loader configuration
marmoute: Same question, why are we getting the filename from the hashcache?
""" | |||||
super().__init__(logging_class=logging_class, config=config or {}) | |||||
self.config = merge_configs(DEFAULT_CONFIG, self.config) | |||||
self._temp_directory = self.config["temp_directory"] | |||||
self._clone_timeout = self.config["clone_timeout_seconds"] | |||||
self.origin_url = url | |||||
marmoute: I find it a bit weird to have the _get_revisions store all the underlying content but not the revision itself. Maybe we should return already-stored revisions there. What do you think?
        self.visit_date = parse_visit_date(visit_date)
marmoute: small typo: "An filtered" → "A filtered"
        self.directory = directory
douardda: shouldn't the `repo_directory` attribute be "declared" here too?
        self._repo: Optional[hgutil.Repository] = None
        self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {}
marmoute: Same question as for the other methods that need to be implemented for incremental loading.
        self._repo_directory: Optional[str] = None

    def pre_cleanup(self) -> None:
douardda: I may be missing something, but why is this a property? Why not just initialize a `self.repo` in the `__fetch_data__` method (where the `repo_directory` is initialized)?
acezar: You're right, it has been added for an old need and is no longer necessary.
"""As a first step, will try and check for dangling data to cleanup. | |||||
This should do its best to avoid raising issues. | |||||
marmoute: Same question (and I assume, same answer)
""" | |||||
clean_dangling_folders( | |||||
self._temp_directory, | |||||
marmoute: (idem)
            pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
            log=self.log,
        )

    def cleanup(self) -> None:
        """Last step executed by the loader."""
        if self._repo_directory and os.path.exists(self._repo_directory):
            self.log.debug(f"Cleaning up repository {self._repo_directory}")
            rmtree(self._repo_directory)

    def prepare_origin_visit(self, *args, **kwargs) -> None:
marmoute: (idem)
"""First step executed by the loader to prepare origin and visit | |||||
references. Set/update self.origin, and | |||||
optionally self.origin_url, self.visit_date. | |||||
""" | |||||
self.origin = Origin(url=self.origin_url) | |||||
def prepare(self, *args, **kwargs) -> None: | |||||
"""Second step executed by the loader to prepare some state needed by | |||||
the loader. | |||||
marmoute: Is this an attribute declaration? If so, this should be in the `__init__`.
""" | |||||
def fetch_data(self) -> bool: | |||||
"""Fetch the data from the source the loader is currently loading | |||||
Returns: | |||||
a value that is interpreted as a boolean. If True, fetch_data needs | |||||
to be called again to complete loading. | |||||
""" | |||||
if not self.directory: # no local repository | |||||
self._repo_directory = mkdtemp( | |||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | |||||
suffix=f"-{os.getpid()}", | |||||
dir=self._temp_directory, | |||||
) | |||||
self.log.debug( | |||||
f"Cloning {self.origin_url} to {self.directory} " | |||||
f"with timeout {self._clone_timeout} seconds" | |||||
) | |||||
douardda: could we have a bit of comment here (for the non-mercurial expert)?
            hgutil.clone(self.origin_url, self._repo_directory, self._clone_timeout)
        else:  # existing local repository
            # Allow to load on disk repository without cloning
vlorentz: also a comment here to explain it's for backward compat
            # for testing purpose.
            self._repo_directory = self.directory
marmoute: Here we seem to assume that the existing repository is up to date. In which case, why do we already have a repository and why don't we need to pull it?
marmoute: (gentle ping)
        self._repo = hgutil.repository(self._repo_directory)

        return False

    def store_data(self):
        """Store fetched data in the database."""
        for rev in self._repo:
            self.store_revision(self._repo[rev])
marmoute: It is still unclear to me when/why we have this special case. Can you clarify that point?
        branch_by_hg_nodeid: Dict[HgNodeId, bytes] = {
            hg_nodeid: name for name, hg_nodeid in hgutil.branches(self._repo).items()
marmoute: Small nit: if there is no other usage than the cli call, maybe pulling is not necessary.
acezar: Removed the TODO
        }
        tags_by_name: Dict[bytes, HgNodeId] = self._repo.tags()

        tags_by_hg_nodeid: Dict[HgNodeId, bytes] = {
marmoute: we should probably call this `hg_nodeid` for clarity.
            hg_nodeid: name for name, hg_nodeid in tags_by_name.items()
        }
douardda: I may be wrong but believe there is no need for computing the revision id "by hand"; something like:

```
rev_obj = Revision.from_dict(revision)
self._node_hashes[ctx.node()] = rev_obj.id
yield rev_obj
```

should work (the id for a model.HashableObject is automatically computed in a post-init hook if not provided)
vlorentz: I even recommend skipping from_dict entirely, so you get type-checking of the arguments:

```
revision = Revision(
    author=author,
    date=rev_date,
    ...
)
```

(and it spares an indirection)
marmoute: +1
        snapshot_branches: Dict[bytes, SnapshotBranch] = {}

        for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items():
            tag_name = tags_by_hg_nodeid.get(hg_nodeid)

            # tip is listed in the tags by the mercurial api
            # but it's not a tag defined by the user in `.hgtags`
            if tag_name and tag_name != b"tip":
marmoute: Same question, is this a new attribute initialisation?
                snapshot_branches[tag_name] = SnapshotBranch(
                    target=self.store_release(tag_name, revision_swhid),
                    target_type=TargetType.RELEASE,
douardda: I'm not very fond of instance attributes being created all over the methods...
vlorentz: yeah, they should be created in the constructor (with a type annotation) so mypy can detect them
marmoute: +1
                )

            if hg_nodeid in branch_by_hg_nodeid:
marmoute: Why do we need to yield the (mercurial) nodeid here?
                name = branch_by_hg_nodeid[hg_nodeid]
                snapshot_branches[name] = SnapshotBranch(
                    target=revision_swhid, target_type=TargetType.REVISION,
                )

                # The tip is mapped to `HEAD` to match
douardda: I don't get what this is doing
marmoute: If I am not mistaken, this is simply mapping the Mercurial revision pointed to by the tag to the SWH revision ID.
                # the historical implementation
marmoute: If we are to store directories one by one, could we simply store them as we build them? It would make the flow simpler and help us with future optimisations (eg: not recreating directories that already exist)
                if hg_nodeid == tags_by_name[b"tip"]:
marmoute: This method could use a docstring.
snapshot_branches[b"HEAD"] = SnapshotBranch( | |||||
target=name, target_type=TargetType.ALIAS, | |||||
) | |||||
marmoute: Not doing any optimization here at first is good. However, it would be nice to add comments about which optimizations are planned and where we plan them. It would also help to make sure we have the right structure for these optimizations to come into play.
        snapshot = Snapshot(branches=snapshot_branches)
        self.storage.snapshot_add([snapshot])

        self.flush()
        self.loaded_snapshot_id = snapshot.id

    def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Sha1Git:
        """Return the swhid of a revision given its hg nodeid.
marmoute: I suspect the same feedback as for revisions applies here; could we create the Release directly instead?
marmoute: I assume the sub-directories will be created automatically by the TreeDirectory API; it might be worth a small comment.
        Args:
marmoute: Exception is quite wide. We need something more accurate. If SWH does not have something more accurate already, using at least ValueError seems appropriate.
marmoute: Why do we have this recursivity? In which case do we expect parents to not be stored before trying to store their children?
            hg_nodeid: the hg nodeid of the revision.

        Returns:
            the swhid of the revision.
douardda: it would be nice to have a bit of explanation on how mercurial's branch/tag model is mapped into swh's (git-like) one here.
marmoute: +1, this is very needed, because this is going to be one of the main questions that will arrive from any mapping.
""" | |||||
return self._revision_nodeid_to_swhid[hg_nodeid] | |||||
marmoute: I would move these utilities higher in the module to have them defined before the code that uses them.
    def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]:
        """Return the swhids of the parent revisions.
douardda: why is this for loop needed? why not just `self.tags_target['tip']`?
marmoute: +1
        Args:
marmoute: Should we pass rev_ctx here instead?
marmoute: Looks like phabricator let this comment drift a bit. Do you remember what it was about?
acezar: it was about passing the context rather than the `nodeid` and asking for the context again from the repo in the method. (was on `self.store_directories(hg_nodeid)`)
            rev_ctx: the revision context.

        Returns:
            the swhids of the parent revisions.
        """
        parents = []
        for parent_ctx in rev_ctx.parents():
            parent_hg_nodeid = parent_ctx.node()
            # nullid is the value of a parent that does not exist
marmoute: Why do we need a HEAD? And why is it tip?
marmoute: The questions still stand.
acezar: Added a comment stating that it matches the historical implementation.
            if parent_hg_nodeid == hgutil.NULLID:
marmoute: I am a bit wary about the reuse of internal attributes to fetch this information, especially because their semantics are not documented so far.
                continue
            parents.append(self.get_revision_id_from_hg_nodeid(parent_hg_nodeid))

        return tuple(parents)
marmoute: "incremental loading" is probably an overloaded, ambiguous term; let's stop using it.
douardda: I don't understand the way offsets are managed here. Why is it always negative? The negative_utc flag is supposed to be set true only for very rare cases coming from git, IIRC, in which one can use a date with UTC-0000 (which makes no sense, but is possible and used for hash computation in git).
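For context on the sign question: Mercurial's `ctx.date()` returns a `(unixtime, offset)` tuple whose offset counts seconds *west* of UTC, the opposite sign of the usual ISO offset (so UTC+02:00 is stored as -7200). A hedged, self-contained conversion sketch:

```
from datetime import datetime, timedelta, timezone

def hg_date_to_datetime(hg_date):
    # hg_date is Mercurial's (unixtime, offset) pair; the offset counts
    # seconds WEST of UTC, so negate it to build a conventional tzinfo.
    timestamp, offset_west = hg_date
    tz = timezone(timedelta(seconds=-offset_west))
    return datetime.fromtimestamp(timestamp, tz=tz)
```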
    def store_revision(self, rev_ctx: hgutil.BaseContext) -> None:
        """Store a revision given its revision context.
vlorentz: same comment as above, you can build the snapshot directly instead of using from_dict:

```
for ...:
    branches[name] = SnapshotBranch(
        target=target, target_type=TargetType.REVISION,
    )
...
snapshot = Snapshot(branches=branches)
```

should automatically compute the id
marmoute: Same question about why we need a two-step approach here.
        Args:
            rev_ctx: the revision context.
        """
        hg_nodeid = rev_ctx.node()
        root_swhid = self.store_directories(rev_ctx)

        # `Person.from_fullname` is compatible with mercurial's freeform author
        # as fullname is what is used in revision hash when available.
        author = Person.from_fullname(rev_ctx.user())
marmoute: We probably want this to be a function on the object. This will abstract the in-memory cache and allow collaboration with the SWH database when we implement incremental loading in the future.
marmoute: I think that this feedback about `self._revision_nodeid_to_swhid[parent_hg_nodeid]` still applies. Am I missing something?
acezar: I thought it was about the whole parent block, hence why I added `self.get_revision_parents(rev_ctx)`.
        (timestamp, offset) = rev_ctx.date()
douardda: Unless I'm wrong, this cache will be (potentially) huge, since it loads the whole hg repo (including file content) in RAM. This is a problem.
        # TimestampWithTimezone.from_dict will change name
        # as it accepts more than just dicts
        rev_date = TimestampWithTimezone.from_dict(int(timestamp))
douardda: still not satisfied by this "hide this under the carpet" path
marmoute: +1
        extra_headers = [
            (b"time_offset_seconds", str(offset).encode(),),
        ]
        for key, value in rev_ctx.extra().items():
            # The default branch is skipped to match
            # the historical implementation
            if key == b"branch" and value == b"default":
                continue

            # transplant_source is converted to match
            # the historical implementation
            if key == b"transplant_source":
                value = hash_to_bytehex(value)
marmoute: In practice, the current setup is not a cache, since it must be fully populated for the code that already went over it to work. So this is a mapping. In addition, as I said before, the current execution order is not going to be usable as a base for future work, and we need a more revision-centric approach (cf previous email). In addition, the current cache/mapping structure is wrong. What we need is a `cache[filename][filerevision(or filenode)] → SWHID` cache, so that we don't have to recompute the hash of everything for every revision. The current `cache[changelog-rev][filename] → SWHID` mostly loads all revisions+manifests in memory and does not add much information compared to what is on disk.
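A minimal sketch of the `cache[filename][filenode] → SWHID` shape marmoute suggests (hypothetical names, not part of this diff):

```
from typing import Callable, Dict

# filename -> filenode -> sha1_git; a file revision is hashed at most
# once, no matter how many changesets reference it.
ContentCache = Dict[bytes, Dict[bytes, bytes]]

def cached_content_id(
    cache: ContentCache, filename: bytes, filenode: bytes,
    compute: Callable[[], bytes],
) -> bytes:
    by_node = cache.setdefault(filename, {})
    if filenode not in by_node:
        by_node[filenode] = compute()  # hash (and store) only on first sight
    return by_node[filenode]
```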
            extra_headers.append((key, value))
douardda: As already pointed out, how will this behave with, say, a mozilla or linux repo?
marmoute: The current plan (not implemented in this Diff) is to have a loader that does:

```
for rev in repo:
    m = manifest
    for file in repo.manifest(rev):
        file_id = store_file(repo.file_content(file))
        m.add_file(filename, file_id)
    manifest_id = store_manifest(m)
    store_rev(…, manifest_id)
```

Then, by adding a simple LRU cache on the `file_id = store_file(repo.file_content(file))` line, we should get something capable of ingesting large repositories. And then, we can start using manifest differences between the parent and the child to directly re-use file_id and manifest computations from the parent. That should bring us to a reasonable complexity and okay-ish runtime for large repositories.
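And a self-contained illustration of the "simple LRU cache" idea from the plan above (hypothetical helper names; the real storage call is elided):

```
from functools import lru_cache
from hashlib import sha1
from typing import Callable

def make_cached_store_file(read_data: Callable[[bytes], bytes], maxsize: int = 10_000):
    # Wrap a filenode -> raw-data reader so each file revision is read
    # and hashed at most once per cache window; repeated manifests
    # listing the same filenode hit the cache instead.
    @lru_cache(maxsize=maxsize)
    def store_file(filenode: bytes) -> bytes:
        data = read_data(filenode)
        # ...a real loader would also send `data` to storage here...
        return sha1(data).digest()

    return store_file
```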
        revision = Revision(
            author=author,
            date=rev_date,
            committer=author,
marmoute: It would be nice to document what a SWH Release maps to in terms of Mercurial concepts.
            committer_date=rev_date,
            type=RevisionType.MERCURIAL,
douardda: I'm a bit worried how this is supposed to work: do we really recompute the hashes of files already seen (eg. a file present from rev 1 and then unmodified for all the revisions) in each revision? Or did I miss something?
marmoute: cf my previous reply (currently, we do)
            directory=root_swhid,
            message=rev_ctx.description(),
            metadata={"node": hg_nodeid.hex()},
            extra_headers=tuple(extra_headers),
            synthetic=False,
            parents=self.get_revision_parents(rev_ctx),
        )
marmoute: We need to keep the tip tag because the previous loader was loading it? Or is there another reason? (It would be nice if the doc was clearer about that.)
acezar: it was a typo: `tip must be ignored`. Because as said: `A release corresponds to a user-defined tag` and tip is not user defined.
        self._revision_nodeid_to_swhid[hg_nodeid] = revision.id
        self.storage.revision_add([revision])
    def store_release(self, name: bytes, target: Sha1Git) -> Sha1Git:
        """Store a release given its name and its target.

        A release corresponds to a user-defined tag in mercurial.
        The mercurial api has a `tip` tag that must be ignored.
marmoute: Why can't we simply split on `os.sep`? If we can't, we need to document why.
        Args:
            name: name of the release.
            target: swhid of the target revision.

        Returns:
marmoute: Same here, it would be nice to document what a SWH Content maps to in terms of Mercurial concepts (here: a file revision + some other stuff).
            the swhid of the stored release.
douardda: Why is this done "by hand" (i.e. reading the .hgtags file) instead of using the mercurial repo object?
""" | |||||
release = Release( | |||||
name=name, | |||||
target=target, | |||||
target_type=ObjectType.REVISION, | |||||
message=None, | |||||
metadata=None, | |||||
synthetic=False, | |||||
author=Person(name=None, email=None, fullname=b""), | |||||
date=None, | |||||
) | |||||
marmoute: You should add a small inline comment to mention that caching is simple and will come in the next revision.
marmoute: (gentle ping)
        self.storage.release_add([release])

        return release.id
    def store_content(self, rev_ctx: hgutil.BaseContext, file_path: bytes) -> Content:
        """Store a file content given its revision context and file path.

        Content is a mix of file content at a given revision
        and its permissions found in the changeset's manifest.
marmoute: Since we don't use date/author on tags (releases) yet, I recommend we split this into a different changeset.
marmoute: What does to_model do? Is it dropping the revision to the binary content of the file? What do we need the "isinstance" call for?
acezar: `to_model` converts `from_disk.Content` to `model.BaseContent` but `storage.content_add` only accepts `model.Content`. In our case we should only have `model.Content` from `to_model`; anything else is actually not handled and should be an error.
marmoute: I see. What about the binary content of the file? When is it dropped?
acezar: Made sure that only the necessary data is returned at the end of the method. The garbage collector should clean things up if storage does not hold a reference until flush.
marmoute: Sure, but which part is dropping the reference to the actual file data? The data are passed into content_data, which is passed into Content, which is passed to ModelContent, which still holds the data because self.storage.content_add is called on it.
marmoute: Never mind, I just noticed that we return a new object at the end of the method:

```
# Here we make sure to return only necessary data.
return Content({"sha1_git": content.hash, "perms": perms})
```

Is this code new, or did I miss it on the first pass?
marmoute: This is indeed new code.
acezar: New in this diff, but was already in the next revision: D4541 (without the comment).
        Args:
            rev_ctx: the revision context.
            file_path: the hg path of the content.

        Returns:
            the Content of the stored file, holding its sha1_git and perms.
        """
        hg_nodeid = rev_ctx.node()
        file_ctx = rev_ctx[file_path]

        perms = FLAG_PERMS[file_ctx.flags()]
marmoute: Same here, please document what a Directory maps to in terms of Mercurial concepts.
        data = file_ctx.data()  # caching is simple and will come in the next revision.

        content_data = MultiHash.from_data(data).digest()
        content_data["length"] = len(data)
        content_data["perms"] = perms
marmoute: Why do we need to implement our own things for this? Doesn't SWH already have a utility for this? If we need something special here, you need to document what it is. And if something special is needed, we should consider implementing it in the upstream module instead.
content_data["data"] = data | |||||
content_data["status"] = "visible" | |||||
Done Inline Actions"compute recursively compute " douardda: "compute recursively compute " | |||||
        content = Content(content_data)
        model = content.to_model()
douardda: typos:
- "when [...] is implemented, it will [...]"
- "recaclulate"
Also I don't get how the "recalculate only" is foreseen here.
        if isinstance(model, ModelContent):
            self.storage.content_add([model])
        else:
            raise ValueError(
                f"{file_path!r} at rev {hg_nodeid.hex()!r} "
                f"produced {type(model)!r} instead of {ModelContent!r}"
            )
marmoute: `.pop(0)` is quite expensive, could we reverse the list and use `.pop()`? If not, let's use a better data structure for this.
marmoute: gentle ping
vlorentz: prefer the type `Sha1Git` (from swh.model.model), it's an alias of `bytes`
        # Here we make sure to return only necessary data.
        return Content({"sha1_git": content.hash, "perms": perms})
    def store_directories(self, rev_ctx: hgutil.BaseContext) -> Sha1Git:
        """Store a revision's directories given its revision context.

        Mercurial has no directory objects as git does. A git-like tree must be built
        from file paths to obtain each directory's hash.
marmoute: What is this class about?
        Args:
            rev_ctx: the revision context.

        Returns:
            the swhid of the top level directory.
        """
        root = HgDirectory()
        for file_path in rev_ctx.manifest():
            content = self.store_content(rev_ctx, file_path)
            root[file_path] = content

        directories: Deque[Directory] = deque([root])
        while directories:
            directory = directories.pop()
vlorentz: you may want to use `errors='replace'` here, in case the name can't be decoded
            self.storage.directory_add([directory.to_model()])
marmoute: what is different compared to the base class?
marmoute: As we discussed before, we need to better define Archive here.
            directories.extend(
                [item for item in directory.values() if isinstance(item, Directory)]
            )

        return root.hash


class HgArchiveLoaderFromDisk(HgLoaderFromDisk):
    """Mercurial loader for repository wrapped within tarballs."""
douardda: It's unclear to me what this tree structure is really for. Couldn't `swh.model.from_disk` data structures be used for this instead of reimplementing it?
marmoute: same question: what is different compared to the base class?
    def __init__(
        self, url: str, visit_date: Optional[datetime] = None, archive_path: str = None
    ):
        super().__init__(
            url,
            visit_date=visit_date,
            logging_class="swh.loader.mercurial.ArchiveLoaderFromDisk",
marmoute: please define the data we are trying to acquire here.
        )
        self.temp_dir = None
        self.archive_path = archive_path

    def prepare(self, *args, **kwargs):
        """Extract the archive instead of cloning."""
        self.temp_dir = tmp_extract(
            archive=self.archive_path,
            dir=self._temp_directory,
            prefix=TEMPORARY_DIR_PREFIX_PATTERN,
            suffix=f".dump-{os.getpid()}",
            log=self.log,
            source=self.origin_url,
        )

        repo_name = os.listdir(self.temp_dir)[0]
        self.directory = os.path.join(self.temp_dir, repo_name)
        super().prepare(*args, **kwargs)
    def cleanup(self) -> None:
        """Remove the extracted archive instead of the cloned repository."""
        if self.temp_dir and os.path.exists(self.temp_dir):
            rmtree(self.temp_dir)
        super().cleanup()


# Allow direct usage of the loader from the command line with
# `python -m swh.loader.mercurial.from_disk $ORIGIN_URL`
if __name__ == "__main__":
    import logging

    import click

    logging.basicConfig(
        level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s"
    )

    @click.command()
    @click.option("--origin-url", help="origin url")
    @click.option("--hg-directory", help="Path to mercurial repository to load")
    @click.option("--visit-date", default=None, help="Visit date")
    def main(origin_url, hg_directory, visit_date):
        return HgLoaderFromDisk(
            origin_url, directory=hg_directory, visit_date=visit_date
        ).load()

    main()
douardda: what? a function located //after// the `if __name__ == "__main__"`?
marmoute: Mercurial author fields are free form. Assuming git-formatted author fields is going to create issues. What do we actually need to store in SWH?
acezar: revision hash uses the freeform fullname when provided and we create the author using `Person.from_fullname`
marmoute: Is this documented?
acezar: Added comment
marmoute: (please move this before the `if __name__ == "__main__"` block.) The semantics of this function need to be documented: what is the actual information we fetch, and why do we need it?