diff --git a/swh/loader/mercurial/slow_loader.py b/swh/loader/mercurial/slow_loader.py
index d0f79d5..c06790b 100644
--- a/swh/loader/mercurial/slow_loader.py
+++ b/swh/loader/mercurial/slow_loader.py
@@ -1,460 +1,473 @@
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# WARNING WARNING WARNING WARNING
# hglib is too slow to be super useful. Unfortunately it's also the only
# python3 library for mercurial as of this writing. - Avi

import datetime
import hglib
import os

from swh.model import identifiers, hashutil
from swh.loader.core.loader import SWHStatelessLoader
from .converters import parse_author, PRIMARY_ALGO as ALGO

OS_PATH_SEP = os.path.sep.encode('utf-8')


def data_to_content_id(data):
    size = len(data)
    ret = {
        'length': size,
    }
    ret.update(identifiers.content_identifier({'data': data}))
    return ret
def blob_to_content_dict(data, existing_hashes=None, max_size=None,
                         logger=None):
    """Convert blob data to a SWH Content. If the blob already
    has hashes computed, don't recompute them.
    TODO: This should be unified with similar functions in other places.

    args:
        existing_hashes: dict of hash algorithm:value pairs
        max_size: size over which blobs should be rejected
        logger: logging class instance

    returns:
        A Software Heritage "content".
    """
    existing_hashes = existing_hashes or {}

    size = len(data)
    content = {
        'length': size,
    }
    content.update(existing_hashes)

    hash_types = list(existing_hashes.keys())
    hashes_to_do = hashutil.DEFAULT_ALGORITHMS.difference(hash_types)
    content.update(hashutil.hash_data(data, algorithms=hashes_to_do))

    if max_size and (size > max_size):
        content.update({
            'status': 'absent',
            'reason': 'Content too large',
        })
        if logger:
            id_hash = hashutil.hash_to_hex(content[ALGO])
            logger.info(
                'Skipping content %s, too large (%s > %s)'
                % (id_hash, size, max_size),
                extra={
                    'swh_type': 'loader_content_skip',
                    'swh_id': id_hash,
                    'swh_size': size
                }
            )
    else:
        content.update({'data': data, 'status': 'visible'})

    return content
class SimpleBlob:
    """ Stores basic metadata for a blob object.
    """
    kind = 'file'

    def __init__(self, file_hash, file_mode):
        self.hash = file_hash
        if not isinstance(file_mode, int):
            self.mode = 0o100000 + int(file_mode, 8)
        else:
            self.mode = file_mode


class SimpleTree(dict):
    """ Stores metadata for a nested 'tree'-like object.
    """
    kind = 'dir'
    mode = 0o040000

    def add_tree_node_for_path(self, path):
        """Deeply nests SimpleTrees according to a directory path and returns
        a cursor to the deepest one"""
        node = self
        for d in path.split(OS_PATH_SEP):
            node = node.setdefault(d, SimpleTree())
        return node

    def remove_tree_node_for_path(self, path):
        """Deletes a SimpleBlob from inside nested SimpleTrees according to
        the given file path"""
        first, sep, rest = path.partition(OS_PATH_SEP)
        if rest:
            self[first].remove_tree_node_for_path(rest)
            if not self.get(first):
                del self[first]
        else:
            del self[first]

    def add_blob(self, file_path, file_hash, file_mode):
        """Deeply nests a SimpleBlob inside nested SimpleTrees according to
        the given file path"""
        fdir = os.path.dirname(file_path)
        fbase = os.path.basename(file_path)
        if fdir:
            node = self.add_tree_node_for_path(fdir)
        else:
            node = self
        node[fbase] = SimpleBlob(file_hash, file_mode)
class HgLoader(SWHStatelessLoader):
    """Load a mercurial repository from a directory.
    """
    CONFIG_BASE_FILENAME = 'loader/hg'

    def __init__(self, logging_class='swh.loader.mercurial.HgLoader'):
        super().__init__(logging_class=logging_class)

    def prepare(self, origin_url, directory, visit_date):
        """see base.BaseLoader.prepare"""
-        self.origin_url = origin_url
+        self.origin = {
+            'type': 'hg',
+            'url': origin_url
+        }
        self.repo = hglib.open(directory)
        self.visit_date = visit_date
        self.node_to_blob_hash = {}
        self.blob_hash_to_file_rev = {}
        self.commit_trees = {}
        self.unique_trees = {}
        self.revisions = {}

    def get_origin(self):
        """Get the origin that is currently being loaded in a format suitable
        for swh.storage"""
-        return {
-            'type': 'hg',
-            'url': self.origin_url
-        }
+        return self.origin

    def fetch_data(self):
        """Fetch the data from the data source"""
        pass

    def has_contents(self):
        """Checks whether we need to load contents"""
        # if we have any revisions, then obviously we have contents.
        return self.has_revisions()
    def iter_changelog(self):
        """Iterate over the repository log"""
        yield from self.repo.log('0:tip', removed=True)

    def get_node_file_if_new(self, f, rev, node_hash):
        """Load a blob from disk"""
        # Fast if the node hash is already cached. Somehow this shortcuts a
        # meaningful but not huge percentage of the loads for a repository.
        if node_hash not in self.node_to_blob_hash:
            file_path = os.path.join(self.repo.root(), f)
            data = self.repo.cat([file_path], rev)
            blob_hash = identifiers.content_identifier(
                {'data': data}
            )[ALGO]

            self.node_to_blob_hash[node_hash] = blob_hash

            if blob_hash not in self.blob_hash_to_file_rev:
                # new blob
                self.blob_hash_to_file_rev[blob_hash] = (file_path, rev)
                return blob_hash, data

        return self.node_to_blob_hash[node_hash], None

    def get_content_ids(self):
        """Get all the contents, but trim away the actual data"""
        self.node_to_blob_hash = {}
        self.blob_hash_to_file_rev = {}
        self.num_contents = 0

        for li in self.iter_changelog():
            c = self.repo[li]
            rev = c.rev()
            manifest = c.manifest()

            for f in c.added() + c.modified():
                node_hash = manifest[f]
                blob_hash, data = self.get_node_file_if_new(f, rev, node_hash)
                if data is not None:  # new blob
                    self.num_contents += 1
                    yield data_to_content_id(data)

    def get_contents(self):
        """Get the contents that need to be loaded"""
        # This method unfortunately loads and hashes the blobs twice.
        max_content_size = self.config['content_size_limit']
        missing_contents = set(
            self.storage.content_missing(
                self.get_content_ids(),
                ALGO
            )
        )

        for oid in missing_contents:
            file_path, rev = self.blob_hash_to_file_rev[oid]
            data = self.repo.cat([file_path], rev)
            yield blob_to_content_dict(
                data, max_size=max_content_size, logger=self.log
            )
    def has_directories(self):
        """Checks whether we need to load directories"""
        # if we have any revs, we must also have dirs
        return self.has_revisions()

    def get_directories(self):
        """Get the directories that need to be loaded"""
        missing_dirs = set(self.storage.directory_missing(
            sorted(self.unique_trees.keys())
        ))

        for dir_hash in missing_dirs:
            yield self.unique_trees[dir_hash]

    def has_revisions(self):
        """Checks whether we need to load revisions"""
        self.num_revisions = int(self.repo.tip()[0]) + 1
        return self.num_revisions > 0

    def update_tree_from_rev(self, tree, rev, only_these_files=None):
        """Iterates over changes in a revision and adds corresponding
        SimpleBlobs to a SimpleTree"""
        if rev >= 0:
            manifest = {k[4]: k for k in self.repo.manifest(rev=rev)}
            loop_keys = only_these_files or manifest.keys()
            for f in loop_keys:
                node_hash = manifest[f][0]
                file_mode = manifest[f][1]
                file_hash, _ = self.get_node_file_if_new(f, rev, node_hash)
                tree.add_blob(f, file_hash, file_mode)

        return tree

    def reconstruct_tree(self, directory):
        """Converts a flat directory into nested SimpleTrees."""
        # This method exists because the code was already written to use
        # SimpleTree before then reducing memory use and converting to the
        # canonical format. A refactor using lookups instead of nesting could
        # obviate the need.
        new_tree = SimpleTree()
        for entry in directory['entries']:
            tgt = entry['target']
            perms = entry['perms']
            name = entry['name']
            if tgt in self.unique_trees:  # subtree
                new_tree[name] = self.reconstruct_tree(self.unique_trees[tgt])
            else:  # blob
                new_tree[name] = SimpleBlob(tgt, perms)
        new_tree.hash = directory['id']
        return new_tree
    def collapse_tree(self, tree):
        """Converts nested SimpleTrees into multiple flat directories."""
        # This method exists because the code was already written to use
        # SimpleTree before then reducing memory use and converting to the
        # canonical format. A refactor using lookups instead of nesting could
        # obviate the need.
        directory = {
            'entries': [
                {
                    'name': k,
                    'perms': v.mode,
                    'type': v.kind,
                    'target': (isinstance(v, SimpleBlob)
                               and v.hash
                               or self.collapse_tree(v))
                }
                for k, v in tree.items()
            ]
        }
        tree.hash = identifiers.directory_identifier(directory)
        directory['id'] = tree.hash
        self.unique_trees[tree.hash] = directory
        return tree.hash
    def get_revision_ids(self):
        """Get the revisions that need to be loaded"""
        self.unique_trees = {}
        commit_tree = None
        for li in self.iter_changelog():
            c = self.repo[li[1]]
            rev = c.rev()

            # start from the parent state
            p1 = c.p1().rev()
            if p1 in self.commit_trees:
                if p1 != rev-1:
                    # Most of the time, a revision will inherit from the
                    # previous one. In those cases we can reuse commit_tree,
                    # otherwise build a new one here.
                    parent_tree = self.unique_trees[self.commit_trees[p1]]
                    commit_tree = self.reconstruct_tree(parent_tree)
            else:
                commit_tree = self.update_tree_from_rev(SimpleTree(), p1)

            # remove whatever is removed
            for f in c.removed():
                commit_tree.remove_tree_node_for_path(f)

            # update whatever is updated
            self.update_tree_from_rev(commit_tree, rev, c.added()+c.modified())

            self.commit_trees[rev] = self.collapse_tree(commit_tree)

            date_dict = identifiers.normalize_timestamp(
                int(c.date().timestamp())
            )
            author_dict = parse_author(c.author())

+            parents = []
+            for p in c.parents():
+                if p.rev() >= 0:
+                    parents.append(self.revisions[p.node()]['id'])
+
+            phase = c.phase()  # bytes
+            rev = str(rev).encode('utf-8')
+            hidden = str(c.hidden()).encode('utf-8')
+            hg_headers = [['phase', phase], ['rev', rev], ['hidden', hidden]]
+
            revision = {
                'author': author_dict,
                'date': date_dict,
                'committer': author_dict,
                'committer_date': date_dict,
                'type': 'hg',
-                'directory': commit_tree.hash,
+                'directory': identifiers.identifier_to_bytes(commit_tree.hash),
                'message': c.description(),
                'metadata': {
-                    'extra_headers': [
-                        ['phase', c.phase()],
-                        ['rev', rev],
-                        ['hidden', c.hidden()]
-                    ]
+                    'extra_headers': hg_headers
                },
                'synthetic': False,
-                'parents': [
-                    self.revisions[p.node()]['id'] for p in c.parents()
-                    if p.rev() >= 0
-                ]
+                'parents': parents,
            }
-            revision['id'] = identifiers.revision_identifier(revision)
+            revision['id'] = identifiers.identifier_to_bytes(
+                identifiers.revision_identifier(revision))
            self.revisions[c.node()] = revision

        for n, r in self.revisions.items():
            yield {'node': n, 'id': r['id']}
    def get_revisions(self):
        """Get the revision identifiers from the repository"""
-        revs = {r['id']: r['node'] for r in self.get_revision_ids()}
+        revs = {
+            r['id']: r['node']
+            for r in self.get_revision_ids()
+        }
        missing_revs = set(self.storage.revision_missing(revs.keys()))
        for r in missing_revs:
            yield self.revisions[revs[r]]

    def has_releases(self):
        """Checks whether we need to load releases"""
        self.num_releases = len([t for t in self.repo.tags() if not t[3]])
        return self.num_releases > 0

    def get_releases(self):
        """Get the releases that need to be loaded"""
        releases = {}
        for t in self.repo.tags():
            islocal = t[3]
            name = t[0]
            if (name != b'tip' and not islocal):
                short_hash = t[2]
                target = self.revisions[self.repo[short_hash].node()]['id']
                release = {
                    'name': name,
                    'target': target,
                    'target_type': 'revision',
                    'message': None,
                    'metadata': None,
                    'synthetic': False,
                    'author': None,
                    'date': None
                }
                id_hash = identifiers.release_identifier(release)
-                release['id'] = id_hash
+                release['id'] = identifiers.identifier_to_bytes(id_hash)
                releases[id_hash] = release

        missing_rels = set(self.storage.release_missing(
            sorted(releases.keys())
        ))

        yield from (releases[r] for r in missing_rels)

-    def has_occurrences(self):
-        """Checks whether we need to load occurrences"""
-        self.num_occurrences = len(
-            self.repo.tags() + self.repo.branches() + self.repo.bookmarks()[0]
-        )
-        return self.num_occurrences > 0
+    def get_snapshot(self):
+        """Get the snapshot that needs to be loaded"""
+        self.num_snapshot = 1

-    def get_occurrences(self):
-        """Get the occurrences that need to be loaded"""
-        for t in (
-            self.repo.tags() + self.repo.branches() + self.repo.bookmarks()[0]
-        ):
-            name = t[0]
-            short_hash = t[2]
-            target = self.revisions[self.repo[short_hash].node()]['id']
-            yield {
-                'branch': name,
-                'origin': self.origin_id,
-                'target': target,
-                'target_type': 'revision',
-                'visit': self.visit,
+        def _get_branches(repo=self.repo):
+            for t in (
+                repo.tags() + repo.branches() + repo.bookmarks()[0]
+            ):
+                name = t[0]
+                short_hash = t[2]
+                node = self.repo[short_hash].node()
+                yield name, {
+                    'target': self.revisions[node]['id'],
+                    'target_type': 'revision'
+                }
+
+        snap = {
+            'branches': {
+                name: branch
+                for name, branch in _get_branches()
            }
+        }
+        snap['id'] = identifiers.identifier_to_bytes(
+            identifiers.snapshot_identifier(snap))
+        return snap
+
+    def flush(self):
+        pass
    def get_fetch_history_result(self):
        """Return the data to store in fetch_history for the current loader"""
        return {
            'contents': self.num_contents,
            'directories': len(self.unique_trees),
            'revisions': self.num_revisions,
            'releases': self.num_releases,
-            'occurrences': self.num_occurrences,
+            'snapshot': self.num_snapshot,
        }

    def save_data(self):
        """We already have the data locally, no need to save it"""
        pass

    def eventful(self):
        """Whether the load was eventful"""
        return True


if __name__ == '__main__':
    import logging
    import sys

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(process)d %(message)s'
    )

    loader = HgLoader()

    origin_url = sys.argv[1]
    directory = sys.argv[2]
    visit_date = datetime.datetime.now(tz=datetime.timezone.utc)

    print(loader.load(origin_url, directory, visit_date))
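
Note on the central change in this diff: `has_occurrences()`/`get_occurrences()` are replaced by `get_snapshot()`, which collects every tag, branch, and bookmark into a single snapshot object and gives it an intrinsic identifier via swh.model. Below is a minimal standalone sketch (not part of the diff) of that identifier computation; it assumes a swh.model version providing `snapshot_identifier`, and the branch name and 20-byte revision id are hypothetical placeholders.

    # Minimal sketch, not part of the diff: compute a snapshot id the way
    # get_snapshot() does. Assumes swh.model with snapshot support installed.
    from swh.model import identifiers

    snap = {
        'branches': {
            b'default': {
                'target': bytes.fromhex('aa' * 20),  # hypothetical revision id
                'target_type': 'revision',
            }
        }
    }
    # snapshot_identifier() returns a hex string; identifier_to_bytes()
    # converts it to the 20-byte form stored by swh.storage, as in the diff.
    snap['id'] = identifiers.identifier_to_bytes(
        identifiers.snapshot_identifier(snap))

Because the identifier is derived only from the sorted branch names and their targets, two visits that see identical refs produce the same snapshot id, which is what lets the loader report `'snapshot': 1` in its fetch-history result.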