swh/loader/mercurial/bundle20_loader.py
@@ … @@ def prepare(self, *, origin_url, visit_date, directory=None):
             origin_url (str): Origin url to load
             visit_date (str/datetime): Date of the visit
             directory (str/None): The local directory to load
         """
         self.branches = {}
         self.tags = []
         self.releases = {}
+        self.node_2_rev = {}
         if not directory: # remote repository
             self.working_directory = mkdtemp(
                 prefix=TEMPORARY_DIR_PREFIX_PATTERN,
                 suffix='-%s' % os.getpid(),
                 dir=self.temp_directory)
             os.makedirs(self.working_directory, exist_ok=True)
             self.hgdir = self.working_directory
@@ … @@ def get_directories(self):
             missing_dirs = self.storage.directory_missing(missing_dirs)
         for _id in missing_dirs:
             yield dirs[_id]
         dirs = {}
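
The hunks in this file all follow the same shape: build candidate objects keyed by id, ask storage which of those ids it does not already have, and yield only the corresponding objects. A minimal standalone sketch of that pattern follows (not part of the diff; `missing_fn` is a stand-in for the storage `*_missing` calls, of which only the contract "given ids, return the unknown ones" is assumed):

# Standalone sketch of the "yield only what the archive is missing" pattern
# used by get_directories, get_revisions and get_releases below.
def yield_missing(objects, missing_fn):
    missing_ids = list(objects)          # candidate ids
    if missing_ids:
        missing_ids = missing_fn(missing_ids)   # keep only unknown ids
    for _id in missing_ids:
        yield objects[_id]

# Example with a fake storage that already knows object b'a':
objects = {b'a': {'id': b'a'}, b'b': {'id': b'b'}}
assert list(yield_missing(objects, lambda ids: [i for i in ids if i != b'a'])) == [{'id': b'b'}]
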
     def get_revisions(self):
         """Get the revisions that need to be loaded."""
-        node_2_rev = {}
         revisions = {}
         self.num_revisions = 0
         for header, commit in self.br.yield_all_changesets():
             if header['node'] in self.reduce_effort:
                 continue
             self.num_revisions += 1
             date_dict = identifiers.normalize_timestamp(
@@ … @@ def get_revisions(self):
                         ['time_offset_seconds',
                          str(commit['time_offset_seconds']).encode('utf-8')],
                     ] + extra_meta
                 },
                 'synthetic': False,
                 'parents': []
             }
-            p1 = node_2_rev.get(header['p1'])
-            p2 = node_2_rev.get(header['p2'])
+            p1 = self.node_2_rev.get(header['p1'])
+            p2 = self.node_2_rev.get(header['p2'])
             if p1:
                 revision['parents'].append(p1)
             if p2:
                 revision['parents'].append(p2)
             revision['id'] = hashutil.hash_to_bytes(
                 identifiers.revision_identifier(revision)
             )
-            node_2_rev[header['node']] = revision['id']
+            self.node_2_rev[header['node']] = revision['id']
             revisions[revision['id']] = revision
         # Converts heads to use swh ids
         self.heads = {
-            branch_name: node_2_rev[node_id]
+            branch_name: self.node_2_rev[node_id]
             for branch_name, node_id in self.heads.items()
         }
-        node_2_rev = None
         missing_revs = revisions.keys()
         if missing_revs:
             missing_revs = set(
                 self.storage.revision_missing(list(missing_revs))
             )
         for r in missing_revs:
             yield revisions[r]
         self.mnode_to_tree_id = None
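
The core of this change is that node_2_rev now lives on the loader instance: get_revisions fills it while computing SWH revision ids, and get_releases later reads it to resolve a tag's hg changeset node into the revision the release should target, instead of dropping it at the end of this method as before. A rough illustration of that flow, with made-up hashes (not the loader's real API):

# Rough illustration (fake values) of why node_2_rev must outlive
# get_revisions: it maps a Mercurial changeset node to the SWH revision id
# computed for it, and get_releases consults it when building release targets.
node_2_rev = {}

hg_node = bytes.fromhex('0123456789abcdef0123456789abcdef01234567')  # hg changeset node
swh_rev_id = bytes.fromhex('89abcdef' * 5)                           # SWH revision id (fake)
node_2_rev[hg_node] = swh_rev_id      # filled while building revisions

# Later, while building releases, a tag pointing at hg_node targets the SWH
# revision rather than the raw hg node (the T1155 fix discussed below):
assert node_2_rev.get(hg_node) == swh_rev_id
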
     def _read_tag(self, tag, split_byte=b' '):
         node, *name = tag.split(split_byte)
         name = split_byte.join(name)
         return node, name
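
For reference, _read_tag splits a raw .hgtags entry of the form b'<node> <tag name>' on the first separator and rejoins the rest, so spaces inside the tag name are preserved. A quick standalone check of that splitting logic, with a made-up entry:

# Standalone copy of the _read_tag splitting logic, exercised on a fake entry.
def read_tag(tag, split_byte=b' '):
    node, *name = tag.split(split_byte)
    return node, split_byte.join(name)

node, name = read_tag(b'0123456789abcdef0123456789abcdef01234567 release 1.0')
assert node == b'0123456789abcdef0123456789abcdef01234567'
assert name == b'release 1.0'   # spaces in the tag name survive the re-join
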
     def get_releases(self):
         """Get the releases that need to be loaded."""
         self.num_releases = 0
         releases = {}
         missing_releases = []
         for t in self.tags:
             self.num_releases += 1
             node, name = self._read_tag(t)
             node = node.decode()
+            node_bytes = hashutil.hash_to_bytes(node)
             if not TAG_PATTERN.match(node):
                 self.log.warn('Wrong pattern (%s) found in tags. Skipping' % (
                     node, ))
                 continue
+            if node_bytes not in self.node_2_rev:
+                self.log.warn('No matching revision for tag %s '
+                              '(hg changeset: %s). Skipping' %
+                              (name.decode(), node))
+                continue
+            tgt_rev = self.node_2_rev[node_bytes]
             release = {
                 'name': name,
-                'target': hashutil.hash_to_bytes(node),
+                'target': tgt_rev,

ardumont: Yes, that's the T1155 fix, targeting the swh revision, thanks.

                 'target_type': 'revision',
                 'message': None,
                 'metadata': None,
                 'synthetic': False,
                 'author': {'name': None, 'email': None, 'fullname': b''},
                 'date': None

ardumont: But I'm not sure we want the message, author, and date, since those do not exist in mercurial. From [1], a tag is a symbolic identifier for a changeset, so indeed it does not contain much. Here, we derive that information from the revision; we kind of force the symbolic resolution.
[1] https://www.mercurial-scm.org/wiki/Tag

anlambert: My point here is to stick to the swh data model (not the mercurial one) and add some relevant info regarding a release (date is quite important, for instance). I agree that this is pure data duplication, but for those who query the /release api endpoint or browse a release page, it may be relevant to add that info. For instance, the page https://archive.softwareheritage.org/browse/release/ac77c274b0e8060d349a1a7e8a5fa7fae12f4ca3/?origin=https://273280703-shooter-player.googlecode.com/hg/ looks sad without them.

ardumont:
> My point here is to stick to the swh data model (not the mercurial one)
But aren't we supposed to adhere to both? For me, we do. My take is that it's not for us to decide; we must state facts about the mercurial repository.
> or browse a release page, it may be relevant to add that info.
Yes, but they already can, by clicking one more link, the target one you fixed (well, they will at some point in the future, when we merge, deploy, clean up, etc.). Also, I fear that modifying this for the archive's benefit is conflating multiple things. @zack, any thoughts on this? In any case, note that, discussing with you some more, it appears the mercurial web interface actually shows the changeset when showing the release, so they solve the indirection in their UI. That lessens my point a little, but not much.

zack: (Thanks for highlighting me, I wasn't following this.) The guiding principle for importing non-synthetic objects has indeed always been to represent as faithfully as possible the information available in the original source code origin. So I concur that, if hg releases do not have timestamps, we should not copy there the timestamps of the pointed revisions. (And that is supported by our underlying data model, where timestamps for release objects are nullable.) Same for other metadata that are not strictly required by our data model. I totally understand the visualization constraint, though. But that can be fixed by heuristics, e.g., if the timestamp is null, try to look up the timestamp of the pointed object and show that instead (maybe with a visual clue that helps users differentiate the two cases). FWIW, I don't think that should be done only for objects coming from specific origins; it's just a general thing. Also, the API should not do the same, and should rather not return a timestamp if it's not in the object.

anlambert: Thanks for clarifying the situation, the diff has been updated accordingly.

             }
             id_hash = hashutil.hash_to_bytes(
                 identifiers.release_identifier(release))
             release['id'] = id_hash
             missing_releases.append(id_hash)
             releases[id_hash] = release
             self.releases[name] = id_hash
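
As for the visualization concern raised in the thread above, one possible shape for the heuristic zack suggests, on the browsing side rather than in the loader, is sketched below. The dicts are illustrative stand-ins, not the real swh-web objects:

# Sketch of the display-side heuristic discussed above: if a release carries no
# date of its own (as hg tags never do), fall back to the date of the revision
# it targets, and report that the value is a fallback so the UI can mark it.
# The dict shapes are assumptions for illustration only.
def date_to_display(release, target_revision):
    if release.get('date') is not None:
        return release['date'], False            # genuine release date
    return target_revision.get('date'), True     # borrowed from the target revision

release = {'name': b'release 1.0', 'date': None}
revision = {'date': {'timestamp': {'seconds': 1234567890}, 'offset': 0}}
shown, is_fallback = date_to_display(release, revision)
assert is_fallback and shown == revision['date']
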