swh/loader/mercurial/bundle20_loader.py
@@ … @@ def prepare(self, *, origin_url, visit_date, directory=None):
             origin_url (str): Origin url to load
             visit_date (str/datetime): Date of the visit
             directory (str/None): The local directory to load
         """
         self.branches = {}
         self.tags = []
         self.releases = {}
+        self.node_2_rev = {}
         if not directory: # remote repository
             self.working_directory = mkdtemp(
                 prefix=TEMPORARY_DIR_PREFIX_PATTERN,
                 suffix='-%s' % os.getpid(),
                 dir=self.temp_directory)
             os.makedirs(self.working_directory, exist_ok=True)
             self.hgdir = self.working_directory
@@ … @@ def get_directories(self):
             missing_dirs = self.storage.directory_missing(missing_dirs)
         for _id in missing_dirs:
             yield dirs[_id]
         dirs = {}
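
The hunks in this file all follow the same shape: build candidate objects keyed by id, ask storage which of those ids it does not already have, and yield only the corresponding objects. A minimal standalone sketch of that pattern follows (not part of the diff; `missing_fn` is a stand-in for the storage `*_missing` calls, of which only the contract "given ids, return the unknown ones" is assumed):

# Standalone sketch of the "yield only what the archive is missing" pattern
# used by get_directories, get_revisions and get_releases below.
def yield_missing(objects, missing_fn):
    missing_ids = list(objects)          # candidate ids
    if missing_ids:
        missing_ids = missing_fn(missing_ids)   # keep only unknown ids
    for _id in missing_ids:
        yield objects[_id]

# Example with a fake storage that already knows object b'a':
objects = {b'a': {'id': b'a'}, b'b': {'id': b'b'}}
assert list(yield_missing(objects, lambda ids: [i for i in ids if i != b'a'])) == [{'id': b'b'}]
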
     def get_revisions(self):
         """Get the revisions that need to be loaded."""
-        node_2_rev = {}
         revisions = {}
         self.num_revisions = 0
         for header, commit in self.br.yield_all_changesets():
             if header['node'] in self.reduce_effort:
                 continue
             self.num_revisions += 1
             date_dict = identifiers.normalize_timestamp(
@@ … @@ def get_revisions(self):
                         ['time_offset_seconds',
                          str(commit['time_offset_seconds']).encode('utf-8')],
                     ] + extra_meta
                 },
                 'synthetic': False,
                 'parents': []
             }
-            p1 = node_2_rev.get(header['p1'])
-            p2 = node_2_rev.get(header['p2'])
+            p1 = self.node_2_rev.get(header['p1'])
+            p2 = self.node_2_rev.get(header['p2'])
             if p1:
                 revision['parents'].append(p1)
             if p2:
                 revision['parents'].append(p2)
             revision['id'] = hashutil.hash_to_bytes(
                 identifiers.revision_identifier(revision)
             )
-            node_2_rev[header['node']] = revision['id']
+            self.node_2_rev[header['node']] = revision['id']
             revisions[revision['id']] = revision
         # Converts heads to use swh ids
         self.heads = {
-            branch_name: node_2_rev[node_id]
+            branch_name: self.node_2_rev[node_id]
             for branch_name, node_id in self.heads.items()
         }
-        node_2_rev = None
         missing_revs = revisions.keys()
         if missing_revs:
             missing_revs = set(
                 self.storage.revision_missing(list(missing_revs))
             )
         for r in missing_revs:
             yield revisions[r]
         self.mnode_to_tree_id = None
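
The core of this change is that node_2_rev now lives on the loader instance: get_revisions fills it while computing SWH revision ids, and get_releases later reads it to resolve a tag's hg changeset node into the revision the release should target, instead of dropping it at the end of this method as before. A rough illustration of that flow, with made-up hashes (not the loader's real API):

# Rough illustration (fake values) of why node_2_rev must outlive
# get_revisions: it maps a Mercurial changeset node to the SWH revision id
# computed for it, and get_releases consults it when building release targets.
node_2_rev = {}

hg_node = bytes.fromhex('0123456789abcdef0123456789abcdef01234567')  # hg changeset node
swh_rev_id = bytes.fromhex('89abcdef' * 5)                           # SWH revision id (fake)
node_2_rev[hg_node] = swh_rev_id      # filled while building revisions

# Later, while building releases, a tag pointing at hg_node targets the SWH
# revision rather than the raw hg node (the T1155 fix discussed below):
assert node_2_rev.get(hg_node) == swh_rev_id
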
     def _read_tag(self, tag, split_byte=b' '):
         node, *name = tag.split(split_byte)
         name = split_byte.join(name)
         return node, name
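
For reference, _read_tag splits a raw .hgtags entry of the form b'<node> <tag name>' on the first separator and rejoins the rest, so spaces inside the tag name are preserved. A quick standalone check of that splitting logic, with a made-up entry:

# Standalone copy of the _read_tag splitting logic, exercised on a fake entry.
def read_tag(tag, split_byte=b' '):
    node, *name = tag.split(split_byte)
    return node, split_byte.join(name)

node, name = read_tag(b'0123456789abcdef0123456789abcdef01234567 release 1.0')
assert node == b'0123456789abcdef0123456789abcdef01234567'
assert name == b'release 1.0'   # spaces in the tag name survive the re-join
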
     def get_releases(self):
         """Get the releases that need to be loaded."""
         self.num_releases = 0
         releases = {}
         missing_releases = []
         for t in self.tags:
             self.num_releases += 1
             node, name = self._read_tag(t)
             node = node.decode()
+            node_bytes = hashutil.hash_to_bytes(node)
             if not TAG_PATTERN.match(node):
                 self.log.warn('Wrong pattern (%s) found in tags. Skipping' % (
                     node, ))
                 continue
+            if node_bytes not in self.node_2_rev:
+                self.log.warn('No matching revision for tag %s '
+                              '(hg changeset: %s). Skipping' %
+                              (name.decode(), node))
+                continue
+            tgt_rev = self.node_2_rev[node_bytes]
             release = {
                 'name': name,
-                'target': hashutil.hash_to_bytes(node),
+                'target': tgt_rev,

ardumont: Yes, that's the T1155 fix, targeting the swh revision, thanks.

                 'target_type': 'revision',
                 'message': None,
                 'metadata': None,
                 'synthetic': False,
                 'author': {'name': None, 'email': None, 'fullname': b''},
                 'date': None

ardumont: But I'm not sure we want the message, author, and date, since those do not exist in mercurial. From [1], a tag is a symbolic identifier for a changeset, so indeed it does not contain much. Here, we derive that information from the revision; we kind of force the symbolic resolution.
[1] https://www.mercurial-scm.org/wiki/Tag

anlambert: My point here is to stick to the swh data model (not the mercurial one) and add some relevant info regarding a release (date is quite important, for instance). I agree that this is pure data duplication, but for those who query the /release api endpoint or browse a release page, it may be relevant to add that info. For instance, the page https://archive.softwareheritage.org/browse/release/ac77c274b0e8060d349a1a7e8a5fa7fae12f4ca3/?origin=https://273280703-shooter-player.googlecode.com/hg/ looks sad without them.

ardumont:
> My point here is to stick to the swh data model (not the mercurial one)
But aren't we supposed to adhere to both? For me, we do. My take is that it's not for us to decide; we must state facts about the mercurial repository.
> or browse a release page, it may be relevant to add that info.
Yes, but they already can, by clicking one more link, the target one you fixed (well, they will at some point in the future, when we merge, deploy, clean up, etc.). Also, I fear that modifying this for the archive's benefit is conflating multiple things. @zack, any thoughts on this? In any case, note that, discussing with you some more, it appears the mercurial web interface actually shows the changeset when showing the release, so they solve the indirection in their UI. That lessens my point a little, but not much.

zack: (Thanks for highlighting me, I wasn't following this.) The guiding principle for importing non-synthetic objects has indeed always been to represent as faithfully as possible the information available in the original source code origin. So I concur that, if hg releases do not have timestamps, we should not copy there the timestamps of the pointed revisions. (And that is supported by our underlying data model, where timestamps for release objects are nullable.) Same for other metadata that are not strictly required by our data model. I totally understand the visualization constraint, though. But that can be fixed by heuristics, e.g., if the timestamp is null, try to look up the timestamp of the pointed object and show that instead (maybe with a visual clue that helps users differentiate the two cases). FWIW, I don't think that should be done only for objects coming from specific origins; it's just a general thing. Also, the API should not do the same, and should rather not return a timestamp if it's not in the object.

anlambert: Thanks for clarifying the situation, the diff has been updated accordingly.

             }
             id_hash = hashutil.hash_to_bytes(
                 identifiers.release_identifier(release))
             release['id'] = id_hash
             missing_releases.append(id_hash)
             releases[id_hash] = release
             self.releases[name] = id_hash
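
As for the visualization concern raised in the thread above, one possible shape for the heuristic zack suggests, on the browsing side rather than in the loader, is sketched below. The dicts are illustrative stand-ins, not the real swh-web objects:

# Sketch of the display-side heuristic discussed above: if a release carries no
# date of its own (as hg tags never do), fall back to the date of the revision
# it targets, and report that the value is a fallback so the UI can mark it.
# The dict shapes are assumptions for illustration only.
def date_to_display(release, target_revision):
    if release.get('date') is not None:
        return release['date'], False            # genuine release date
    return target_revision.get('date'), True     # borrowed from the target revision

release = {'name': b'release 1.0', 'date': None}
revision = {'date': {'timestamp': {'seconds': 1234567890}, 'offset': 0}}
shown, is_fallback = date_to_display(release, revision)
assert is_fallback and shown == revision['date']
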