diff --git a/swh/loader/mercurial/bundle20_loader.py b/swh/loader/mercurial/bundle20_loader.py index 4b702cb..b227af7 100644 --- a/swh/loader/mercurial/bundle20_loader.py +++ b/swh/loader/mercurial/bundle20_loader.py @@ -1,292 +1,296 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """This document contains a SWH loader for ingesting repository data from Mercurial version 2 bundle files. """ # NOTE: The code here does expensive work twice in places because of the # intermediate need to check for what is missing before sending to the database # and the desire to not juggle very large amounts of data. # TODO: Decide whether to also serialize to disk and read back more quickly # from there. Maybe only for very large repos and fast drives. # - Avi import os import hglib -import base -import converters -from bundle20_reader import Bundle20Reader -from converters import PRIMARY_ALGO as ALGO -from objects import SelectiveCache, SimpleTree + from swh.model import hashutil, identifiers + +from . import converters +from .bundle20_reader import Bundle20Reader +from .converters import PRIMARY_ALGO as ALGO +from .objects import SelectiveCache, SimpleTree +from .base import BaseLoader + + DEBUG = True MAX_BLOB_SIZE = 100*1024*1024 # bytes # TODO: What should MAX_BLOB_SIZE be? -class HgBundle20Loader(base.BaseLoader): +class HgBundle20Loader(BaseLoader): CONFIG_BASE_FILENAME = 'loader/hg-loader' BUNDLE_FILENAME = 'HG20_none_bundle' def __init__(self): self.hg = None self.tags = [] def prepare(self, origin_url, directory, fetch_date): """see base.BaseLoader.prepare""" self.origin_url = origin_url self.fetch_date = fetch_date self.hgdir = directory bundle_path = os.path.join(directory, HgBundle20Loader.BUNDLE_FILENAME) if DEBUG and not os.path.isfile(bundle_path): # generate a bundle from the given directory if needed (testing) with hglib.open(directory) as repo: repo.bundle( bytes(bundle_path, 'utf-8'), all=True, type=b'none' ) self.br = Bundle20Reader(bundle_path) def get_origin(self): """Get the origin that is currently being loaded in format suitable for swh.storage.""" return { 'type': 'hg', 'url': self.origin_url } def fetch_data(self): """Fetch the data from the data source.""" pass def get_contents(self): """Get the contents that need to be loaded.""" self.file_node_to_hash = {} missing_contents = set() hash_to_info = {} self.num_contents = 0 for blob, node_info in self.br.yield_all_blobs(): self.num_contents += 1 file_name = node_info[0] header = node_info[2] blob_hash = hashutil.hash_data(blob, algorithms=set([ALGO]))[ALGO] self.file_node_to_hash[header['node']] = blob_hash hash_to_info[blob_hash] = node_info missing_contents.add(blob_hash) if file_name == b'.hgtags': # https://www.mercurial-scm.org/wiki/GitConcepts#Tag_model self.tags = blob.split(b'\n') # overwrite until the last one if not DEBUG: missing_contents = set( self.storage.content_missing(iter(missing_contents), ALGO) ) # Clusters needed blobs by file offset and then only fetches the # groups at the needed offsets. 
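# focs maps each file's start offset within the bundle to a dict of
# {node id: blob hash} for the missing blobs at that offset, so every
# file group is seeked to and read from the bundle only once.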
focs = {} # "file/offset/contents" for blob_hash in missing_contents: _, file_offset, header = hash_to_info[blob_hash] focs.setdefault(file_offset, {}) focs[file_offset][header['node']] = blob_hash hash_to_info = None for offset, node_hashes in sorted(focs.items()): for header, data, *_ in self.br.yield_group_objects( group_offset=offset ): node = header['node'] if node in node_hashes: blob, meta = self.br.extract_meta_from_blob(data) yield converters.blob_to_content_dict( data=blob, existing_hashes={ALGO: node_hashes[node]}, max_size=MAX_BLOB_SIZE ) # # NOTE: This is a slower but cleaner version of the code above. # for blob, node_info in self.br.yield_all_blobs(): # header = node_info[2] # node = header['node'] # blob_hash = self.file_node_to_hash[node] # if blob_hash in missing_contents: # yield converters.blob_to_content_dict( # data=blob, # existing_hashes={ALGO: blob_hash}, # max_size=MAX_BLOB_SIZE # ) def load_directories(self): """This is where the work is done to convert manifest deltas from the repository bundle into SWH directories. """ self.mnode_to_tree_id = {} base_manifests = self.br.build_manifest_hints() def tree_size(t): return t.size() self.trees = SelectiveCache(cache_hints=base_manifests, size_function=tree_size) tree = SimpleTree() for header, added, removed in self.br.yield_all_manifest_deltas( base_manifests ): node = header['node'] basenode = header['basenode'] tree = self.trees.fetch(basenode) or tree # working tree for path in removed.keys(): tree = tree.remove_tree_node_for_path(path) for path, info in added.items(): file_node, is_symlink, perms_code = info tree = tree.add_blob( path, self.file_node_to_hash[file_node], is_symlink, perms_code ) new_dirs = [] self.mnode_to_tree_id[node] = tree.hash_changed(new_dirs) self.trees.store(node, tree) yield header, tree, new_dirs def get_directories(self): """Get the directories that need to be loaded.""" missing_dirs = [] self.num_directories = 0 for header, tree, new_dirs in self.load_directories(): for d in new_dirs: self.num_directories += 1 missing_dirs.append(d['id']) missing_dirs = set(missing_dirs) if not DEBUG: missing_dirs = set( self.storage.directory_missing(missing_dirs) ) for header, tree, new_dirs in self.load_directories(): for d in new_dirs: if d['id'] in missing_dirs: yield d def get_revisions(self): """Get the revisions that need to be loaded.""" self.branches = {} revisions = {} self.num_revisions = 0 for header, commit in self.br.yield_all_changesets(): self.num_revisions += 1 date_dict = identifiers.normalize_timestamp( int(commit['time'].timestamp()) ) author_dict = converters.parse_author(commit['user']) if commit['manifest'] == Bundle20Reader.NAUGHT_NODE: directory_id = SimpleTree().hash_changed() else: directory_id = self.mnode_to_tree_id[commit['manifest']] extra_meta = [] extra = commit.get('extra') if extra: for e in extra.split(b'\x00'): k, v = e.split(b':', 1) k = k.decode('utf-8') extra_meta.append([k, v]) if k == 'branch': # needed for Occurrences self.branches[v] = header['node'] revision = { 'author': author_dict, 'date': date_dict, 'committer': author_dict, 'committer_date': date_dict, 'type': 'hg', 'directory': directory_id, 'message': commit['message'], 'metadata': { 'node': header['node'], 'extra_headers': [ ['time_offset_seconds', commit['time_offset_seconds']], ] + extra_meta }, 'synthetic': False, 'parents': [ header['p1'], header['p2'] ] } revision['id'] = identifiers.revision_identifier(revision) revisions[revision['id']] = revision missing_revs = revisions.keys() if not DEBUG: 
missing_revs = set( self.storage.revision_missing(missing_revs) ) for r in missing_revs: yield revisions[r] self.mnode_to_tree_id = None def get_releases(self): """Get the releases that need to be loaded.""" releases = {} self.num_releases = 0 for t in self.tags: self.num_releases += 1 node, name = t.split(b' ') release = { 'name': name, 'target': node, 'target_type': 'revision', 'message': None, 'metadata': None, 'synthetic': False, 'author': None, 'date': None } id_hash = identifiers.release_identifier(release) release['id'] = id_hash releases[id_hash] = release yield from releases.values() def get_occurrences(self): """Get the occurrences that need to be loaded.""" self.num_occurrences = 0 for name, target in self.branches.items(): self.num_occurrences += 1 yield { 'branch': name, 'origin': self.origin_url, 'target': target, 'target_type': 'revision', 'visit': self.visit, } def get_fetch_history_result(self): """Return the data to store in fetch_history.""" return { 'contents': self.num_contents, 'directories': self.num_directories, 'revisions': self.num_revisions, 'releases': self.num_releases, 'occurrences': self.num_occurrences } diff --git a/swh/loader/mercurial/bundle20_loader_verifier.py b/swh/loader/mercurial/bundle20_loader_verifier.py index d566c59..bf229b5 100644 --- a/swh/loader/mercurial/bundle20_loader_verifier.py +++ b/swh/loader/mercurial/bundle20_loader_verifier.py @@ -1,217 +1,218 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import code +import hglib import os import random import sys import time -from binascii import hexlify, unhexlify -import hglib +from binascii import hexlify, unhexlify -from bundle20_loader import HgBundle20Loader -from converters import PRIMARY_ALGO as ALGO -from objects import SimpleTree from swh.model import hashutil +from .bundle20_loader import HgBundle20Loader +from .converters import PRIMARY_ALGO as ALGO +from .objects import SimpleTree + class HgLoaderValidater(HgBundle20Loader): def generate_all_blobs(self, validate=True, frequency=1): print('GENERATING BLOBS') i = 0 start = time.time() u = set() for blob, node_info in self.br.yield_all_blobs(): filename = node_info[0] header = node_info[2] i += 1 bhash = hashutil.hash_data(blob, algorithms=set([ALGO]))[ALGO] self.file_node_to_hash[header['node']] = bhash u.update([bhash]) if validate: if random.random() < frequency: self.validate_blob(filename, header, blob) if i % 10000 == 0: print(i) print('') print('FOUND', i, 'BLOBS') print('FOUND', len(u), 'UNIQUE BLOBS') print('ELAPSED', time.time()-start) def validate_blob(self, filename, header, blob): if not self.hg: self.hg = hglib.open(self.hgdir) data = bytes(blob) filepath = os.path.join(self.hg.root(), bytes(filename)) linknode = hexlify(header['linknode']) cat_contents = self.hg.cat([filepath], rev=linknode) if cat_contents != data: print('INTERNAL ERROR ERROR ERROR ERROR') print(filename) print(header) print('-----') print(cat_contents) print('---- vs ----') print(data) code.interact(local=dict(globals(), **locals())) quit() else: print('v', end='') def generate_all_trees(self, validate=True, frequency=1): print('GENERATING MANIFEST TREES') c = 0 n = 0 u = set() start = time.time() validated = 0 for header, tree, new_dirs in self.load_directories(): if validate and (c >= validated) and (random.random() < frequency): self.validate_tree(tree, 
header, c) for d in new_dirs: u.add(d['id']) c += 1 n += len(new_dirs) print('.', end='') if c % 20 == 0: sys.stdout.flush() if c % 10000 == 0: print(c) print('') print('FOUND', c, 'COMMIT MANIFESTS') print('FOUND', n, 'NEW DIRS') print('FOUND', len(u), 'UNIQUE DIRS') print('ELAPSED', time.time()-start) def validate_tree(self, tree, header, i): if not self.hg: self.hg = hglib.open(self.hgdir) commit_id = header['linknode'] if len(commit_id) == 20: commit_id = hexlify(commit_id) base_tree = SimpleTree() base_files = list(self.hg.manifest(rev=commit_id)) bfiles = sorted([f[4] for f in base_files]) for p in base_files: base_tree.add_blob( p[4], self.file_node_to_hash[unhexlify(p[0])], p[3], p[1] ) base_tree.hash_changed() files = sorted(list(tree.flatten().keys())) if tree != base_tree: print('validating rev:', i, 'commit:', commit_id) print('validating files:', len(files), len(base_files)) print(' INVALID TREE') def so1(a): keys = [k['name'] for k in a['entries']] return b''.join(sorted(keys)) tree_dirs = [d for d in tree.yield_SWH_directories()] base_dirs = [d for d in base_tree.yield_SWH_directories()] tree_dirs.sort(key=so1) base_dirs.sort(key=so1) # for i in range(len(tree_dirs)): # if tree_dirs[i] != base_dirs[i]: # print(i) # code.interact(local=dict(globals(), **locals())) print("Program will quit after your next Ctrl-D") code.interact(local=dict(globals(), **locals())) quit() else: print('v', end='') def generate_all_commits(self, validate=True, frequency=1): i = 0 start = time.time() for rev in self.get_revisions(): print('.', end='') i += 1 if i % 20 == 0: sys.stdout.flush() print('') print('FOUND', i, 'COMMITS') print('ELAPSED', time.time()-start) def runtest(self, hgdir, validate_blobs=False, validate_trees=False, frequency=1.0): """ HgLoaderValidater().runtest('/home/avi/SWH/mozilla-unified') """ self.prepare('nil', hgdir, 'nil') self.file_node_to_hash = {} # blobs_pickle = 'blob_hashes.' 
+ os.path.basename(hgdir) + '.pickle' # if os.path.exists(blobs_pickle): # with open(blobs_pickle, 'rb') as pfile: # self.file_node_to_hash = pickle.load(pfile) # if not self.file_node_to_hash: # self.generate_all_blobs(validate=validate_blobs, # frequency=frequency) # with open(blobs_pickle, 'wb') as pfile: # pickle.dump(self.file_node_to_hash, pfile) # self.generate_all_trees(validate=validate_trees, frequency=frequency) # self.generate_all_commits() for c in self.get_contents(): pass for d in self.get_directories(): pass for rev in self.get_revisions(): pass for rel in self.get_releases(): print(rel) self.visit = 'foo' for o in self.get_occurrences(): print(o) def main(): if len(sys.argv) > 1: test_repo = sys.argv[1] else: print("Please pass in the path to an HG repository.") quit() while test_repo[-1] == '/': test_repo = test_repo[:-1] if len(sys.argv) > 2: validate_frequency = float(sys.argv[2]) else: validate_frequency = 0.001 HgLoaderValidater().runtest(test_repo, True, True, validate_frequency) if __name__ == '__main__': main() diff --git a/swh/loader/mercurial/bundle20_reader.py b/swh/loader/mercurial/bundle20_reader.py index daf8b46..0ef827d 100644 --- a/swh/loader/mercurial/bundle20_reader.py +++ b/swh/loader/mercurial/bundle20_reader.py @@ -1,604 +1,605 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """This document contains code for extracting all of the data from Mercurial version 2 bundle file. It is referenced by bundle20_loader.py """ # ============================================================================= # ============================================================================= # BACKGROUND # ============================================================================= # ============================================================================= # # https://www.mercurial-scm.org/wiki/BundleFormat says: # "The new bundle format design is described on the BundleFormat2 page." # # https://www.mercurial-scm.org/wiki/BundleFormat2#Format_of_the_Bundle2_Container says: # noqa # "The latest description of the binary format can be found as comment in the # Mercurial source code." # # https://www.mercurial-scm.org/repo/hg/file/default/mercurial/help/internals/bundles.txt says: # noqa # "The 'HG20' format is not yet documented here. See the inline comments in # 'mercurial/exchange.py' for now." # # ----------------------------------------------------------------------------- # Avi says: # ----------------------------------------------------------------------------- # # All of the above official(?) statements seem to be quite wrong. # # The mercurial-scm wiki is a cluster#@*& of missing pages, bad links, wrong # information, obsolete information, undecipherable names, and half-started # leavings that only sort of look like content. I don't understand who or what # it's there for. I think that means it's not there for me? # # https://www.mercurial-scm.org/wiki/BundleFormat2#New_header is wrong and # bizarre, and there isn't any other information on the page. # # https://www.mercurial-scm.org/repo/hg/file/de86a6872d06/mercurial/help/internals/changegroups.txt # noqa # is very close to what we need. It is accurate, current, and thorough. 
# It describes much of the internal structure, which is super helpful if you # know in advance which info can be trusted, but it doesn't describe any of the # file-level details, including the file headers and that the entire bundle # is broken into overlayed 4KB chunks starting from just after the bundle # header, nor does it describe what any of the component elements are used for, # nor does it explain the meta-message segment in the blob deltas, nor does it # explain the file flags occasionally appended to manifest file hashes. Also it # says: "The [delta data] format is described more fully in 'hg help # internals.bdiff'", which is also wrong. As far as I can tell, that file has # never existed. # # It does however have one potentially extremely useful note buried in the # middle that, in hindsight, could have significant implications for complexity # and performance in future Mercurial loading work. # # It says: "In version 1, the delta is always applied against the previous node # from the changegroup or the first parent if this is the first entry in the # changegroup." # # If the next version of HG support for SWH can reliably get version 1 data, # then it could be implemented entirely without worrying about ballooning # memory utilization, which would shrink the code significantly and probably be # faster too. So maybe HG10 bundles instead of HG20 bundles are superior for # this task? But then I read that servers can optionally disable serving # version 1 content, and I like to think that this loader could eventually # be applied directly to a network stream without an intermediate phase for # cloning and local bundling, so...It seemed like a good idea at the time? # # ----------------------------------------------------------------------------- # Other notes and thoughts: # ----------------------------------------------------------------------------- # 1) # This is a relatively minor detail, but # Mercurial nodes are not content-addressable like Git's are. # # https://www.mercurial-scm.org/wiki/Nodeid explains: "If you modify a file, # commit the change, and then modify it to restore the original contents, the # contents are the same but the history is different, so the file will get a # new nodeid. This history-sensitivity is obtained by calculating the nodeid # from the concatentation of the parent nodeids with the file's contents..." # # The result is that we always have to collect and hash everything at least # once in order to know if we've seen something like it before, because nothing # tells us that the node we're looking at is unique. We can use node ids for # linking disparate elements together (e.g. commit to manifest) but not for # determining whether two elements in the same group are identical in all but # descendency. So there's no way to save time on duplicate hashing. Well... # there is the copied_file blob metadata, but, lol. # # 2) # Most of the code complexity is due to dealing with 'version 2' changegroups, # for which we need to keep track of the entire history of all updates made # to a given file or working directory tree structure, because a revision # delta could be applied over any of the prior revisions all the way back to # rev 0, according to whenever files were branched/merged/uncommitted/etc. For # very large repositories with a lot of churn, this can quickly expand to # require multiple gigabytes of space, possibly exceeding RAM availability if # one desires to keep as much data resident in memory as possible to boost # performance. 
mozilla-unified, for instance, produces some 2 million+ blobs # (1.6 million+ unique). Nested umpteen subdirectory levels deep, those blobs # balloon into a quantity of directory subtrees that rapidly exceeds an 8GB RAM # laptop's ability to keep them all active without a good amount of care and # pruning. The code here tries to strike a balance between memory utilization # and performance. # # This problem is also referenced in the last paragraph of the previous # section, where potentially this problem wouldn't exist for 'version 1' data # if we can reliably get it. Can we? Either that or not use bundles at all, # which has other costs. # # 3) # If the list of changed files stored by the changesets had indicated which # of those changed files were added or modified and which ones were removed, # this code could be much faster. Right now we have to perform a huge number of # substring replacements (see the apply_revdata method) to produce a complete # file manifest for each commit (as a string!!!) in order to know how to get # the set of removed files from the next delta. We can intuit from every # manifest delta which files were modified or added, but I believe there's no # way to intuit which files were removed without actually having the complete # prior state and without the list of removals being explicitly given. If you # have an explicit list of all the files that were removed for any given commit # changegroup, and you combine that with the delta updates in the manifest # changegroups which detail the set of files that have been added or modified, # then you wouldn't even have to apply any of the string deltas to get a # complete understanding of the set of differences between one manifest and the # next. Not having this effective speed boost is rather unfortunate; it would # require only one extra stored byte per commit to differentiate removals and # would make extracting bundles lightning fast. # ============================================================================ ## import itertools import struct + from binascii import unhexlify from collections import OrderedDict from datetime import datetime -from chunked_reader import ChunkedFileReader -from objects import SelectiveCache +from .chunked_reader import ChunkedFileReader +from .objects import SelectiveCache def unpack(fmt_str, source): """Utility function for fetching the right number of bytes from a stream to satisfy a struct.unpack pattern. args: fmt_str: a struct.unpack string pattern (e.g. '>I' for 4 bytes big-endian) source: any IO object that has a read() method which returns an appropriate sequence of bytes """ ret = struct.unpack(fmt_str, source.read(struct.calcsize(fmt_str))) if len(ret) == 1: return ret[0] return ret class Bundle20Reader(object): """Parser for extracting data from Mercurial Bundle20 files. args: bundlefile: string name of the binary repository bundle file cache_size: int tuning parameter for the upper RAM limit used by historical data caches. The default is defined in the SelectiveCache class. """ NAUGHT_NODE = b'\x00' * 20 def __init__(self, bundlefile, cache_size=None): self.bundlefile = bundlefile bfile = open(bundlefile, 'rb', buffering=200*1024*1024) btype = bfile.read(4) # 'HG20' if btype != b'HG20': raise Exception(bundlefile, b'Not an HG20 bundle. 
First 4 bytes:' + btype) bfile.read(4) # '\x00\x00\x00\x00' self.params = self.read_bundle_header(bfile) print('PARAMETERS', self.params) self.num_commits = self.params[b'nbchanges'] self.filereader = ChunkedFileReader(bfile) self.cache_size = cache_size self.blobs_offset = None self.changes_offset = self.filereader.tell() self.changes_next_offset = None self.manifests_offset = None self.manifests_next_offset = None self.id_to_info = {} def read_bundle_header(self, bfile): """Parse the file header which describes the format and parameters. See the structure diagram at the top of the file for more insight. args: bfile: bundle file handle with the cursor at the start offset of the content header (the 9th byte in the file) returns: dict of decoded bundle parameters """ unpack('>I', bfile) # header length chg_len = unpack('>B', bfile) # len('CHANGEGROUP') == 11 bfile.read(chg_len) # should say 'CHANGEGROUP' unpack('>I', bfile) # probably \x00\x00\x00\x00 n_mandatory, n_advisory = unpack('>BB', bfile) # parameter counts mandatory_params = [ (key_len, val_len) for key_len, val_len in [unpack('>BB', bfile) for i in range(n_mandatory)] ] advisory_params = [ (key_len, val_len) for key_len, val_len in [unpack('>BB', bfile) for i in range(n_advisory)] ] params = {} for key_len, val_len in mandatory_params+advisory_params: key = unpack('>%ds' % key_len, bfile) val = int(unpack('>%ds' % val_len, bfile)) params[key] = val return params def revdata_iterator(self, bytes_to_read): """A chunk's revdata section is a series of start/end/length/data_delta content updates called RevDiffs that indicate components of a text diff applied to the node's basenode. The sum length of all diffs is the length indicated at the beginning of the chunk at the start of the header. See the structure diagram at the top of the file for more insight. args: bytes_to_read: int total number of bytes in the chunk's revdata yields: (int, int, read iterator) representing a single text diff component """ while bytes_to_read > 0: start_offset = unpack('>I', self.filereader) end_offset = unpack('>I', self.filereader) blocklen = unpack('>I', self.filereader) delta_it = self.filereader.read_iterator(blocklen) bytes_to_read -= (12 + blocklen) yield (start_offset, end_offset, delta_it) # RevDiff def read_chunk_header(self): """The header of a RevChunk describes the id ('node') for the current change, the commit id ('linknode') associated with this change, the parental heritage ('p1' and 'p2'), and the node to which the revdata updates will apply ('basenode'). 'linknode' is the same as 'node' when reading the commit log because any commit is already itself. 'basenode' for a changeset will be NAUGHT_NODE, because changeset chunks include complete information and not diffs. See the structure diagram at the top of the file for more insight. returns: dict of the next delta header """ header = self.filereader.read(100) header = { 'node': header[0:20], 'p1': header[20:40], 'p2': header[40:60], 'basenode': header[60:80], 'linknode': header[80:100] } return header def read_revchunk(self): """Fetch a complete RevChunk. A RevChunk contains the collection of line changes made in a particular update. header['node'] identifies which update. Commits, manifests, and files all have these. Each chunk contains an indicator of the whole chunk size, an update header, and then the body of the update as a series of text diff components. See the structure diagram at the top of the file for more insight. 
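        Illustrative layout of a single RevChunk, as implied by
        read_chunk_header and the size arithmetic below (a sketch, not a
        normative description of the format):

            [4 bytes ]  total chunk length N (big-endian uint32)
            [20 bytes]  node
            [20 bytes]  p1
            [20 bytes]  p2
            [20 bytes]  basenode
            [20 bytes]  linknode
            [N - 104 ]  revdata: repeated (start, end, length, delta) records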
returns: tuple(dict, iterator) of (header, chunk data) if there is another chunk in the group, else None """ size = unpack('>I', self.filereader) - 104 if size >= 0: header = self.read_chunk_header() return (header, self.revdata_iterator(size)) else: return None # NullChunk def extract_commit_metadata(self, data): """Converts the binary commit metadata format into a dict. args: data: bytestring of encoded commit information returns: dict of decoded commit information """ parts, message = data.split(b'\n\n', 1) parts = parts.split(b'\n') commit = {} commit['message'] = message commit['manifest'] = unhexlify(parts[0]) commit['user'] = parts[1] tstamp, tz, *extra = parts[2].split(b' ') commit['time'] = datetime.fromtimestamp(float(tstamp)) commit['time_offset_seconds'] = int(tz) if extra: commit['extra'] = extra[0] commit['changed_files'] = parts[3:] return commit def skip_sections(self, num_sections=1): """Skip past sections quickly. args: num_sections: int number of sections to skip """ for i in range(num_sections): size = unpack('>I', self.filereader) while size >= 104: self.filereader.seek(size - 4, from_current=True) size = unpack('>I', self.filereader) def apply_revdata(self, revdata_it, prev_state): """Compose the complete text body for a change from component deltas. args: revdata_it: output from the revdata_iterator method prev_state: bytestring the base complete text on which the new deltas will be applied returns: (bytestring, list, list) the new complete string and lists of added and removed components (used in manifest processing) """ state = [] added = [] removed = [] next_start = 0 for delta_start, delta_end, rev_diff_it in revdata_it: removed.append(prev_state[delta_start:delta_end]) added.append(b''.join(rev_diff_it)) state.append(prev_state[next_start:delta_start]) state.append(added[-1]) next_start = delta_end state.append(prev_state[next_start:]) state = b''.join(state) return (state, added, removed) def skim_headers(self): """Get all header data from a change group but bypass processing of the contained delta components. yields: output of read_chunk_header method for all chunks in the group """ size = unpack('>I', self.filereader) - 104 while size >= 0: header = self.read_chunk_header() self.filereader.seek(size, from_current=True) yield header size = unpack('>I', self.filereader) - 104 def group_iterator(self): """Bundle sections are called groups. These are composed of one or more revision chunks of delta components. Iterate over all the chunks in a group and hand each one back. yields: see output of read_revchunk method """ revchunk = self.read_revchunk() while revchunk: # A group is terminated by a NullChunk yield revchunk # (header, revdata_iterator) revchunk = self.read_revchunk() def yield_group_objects(self, cache_hints=None, group_offset=None): """Bundles are sectioned into groups: the log of all commits, the log of all manifest changes, and a series of logs of blob changes (one for each file). All groups are structured the same way, as a series of revisions each with a series of delta components. Iterate over the current group and return the completed object data for the current update by applying all of the internal delta components to each prior revision. 
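        Schematically, each iteration of the loop below does (a sketch of
        the existing behaviour, not anything extra):

            data[node] = apply_revdata(deltas, data[basenode] or b'')

        with prior results held in a SelectiveCache keyed by node id.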
args: cache_hints: see build_cache_hints (this will be built automatically if not pre-built and passed in) group_offset: int file position of the start of the desired group yields: (dict, bytestring, list, list) the output from read_chunk_header followed by the output from apply_revdata """ if group_offset is not None: self.filereader.seek(group_offset) if cache_hints is None: cache_hints = self.build_cache_hints() data_cache = SelectiveCache(self.cache_size, cache_hints) # Loop over all revisions in the group data = b'' for header, revdata_it in self.group_iterator(): node = header['node'] basenode = header['basenode'] data = data_cache.fetch(basenode) or b'' data, added, removed = self.apply_revdata(revdata_it, data) data_cache.store(node, data) yield (header, data, added, removed) # each RevChunk def extract_meta_from_blob(self, data): """File revision data sometimes begins with a metadata section of dubious value. Strip it off and maybe decode it. It seems to be mostly useless. Why indicate that a file node is a copy of another node? You can already get that information from the delta header. args: data: bytestring of one revision of a file, possibly with metadata embedded at the start returns: (bytestring, dict) of (the blob data, the meta information) """ meta = {} if data.startswith(b'\x01\n'): empty, metainfo, data = data.split(b'\x01\n', 2) metainfo = b'\x01\n' + metainfo + b'\x01\n' if metainfo.startswith(b'copy:'): # direct file copy (?) copyinfo = metainfo.split(b'\n') meta['copied_file'] = copyinfo[0][6:] meta['copied_rev'] = copyinfo[1][9:] elif metainfo.startswith(b'censored:'): # censored revision deltas must be full-replacements (?) meta['censored'] = metainfo else: # no idea meta['text'] = metainfo return data, meta def seek_changelog(self): """Seek to the beginning of the change logs section. """ self.filereader.seek(self.changes_offset) def seek_manifests(self): """Seek to the beginning of the manifests section. """ if self.manifests_offset is None: self.seek_changelog() self.skip_sections(1) # skip past commits self.manifests_offset = self.filereader.tell() else: self.filereader.seek(self.manifests_offset) def seek_filelist(self): """Seek to the beginning of the file changes section. """ if self.blobs_offset is None: self.seek_manifests() self.skip_sections(1) # skip past manifests self.blobs_offset = self.filereader.tell() else: self.filereader.seek(self.blobs_offset) def yield_all_blobs(self): """Gets blob data from the bundle. yields: (bytestring, (bytestring, int, dict)) of (blob data, (file name, start offset of the file within the bundle, node header)) """ self.seek_filelist() # Loop through all files that have commits size = unpack('>I', self.filereader) while size > 0: file_name = self.filereader.read(size-4) file_start_offset = self.filereader.tell() # get all of the blobs for each file for header, data, *_ in self.yield_group_objects(): blob, meta = self.extract_meta_from_blob(data) yield blob, (file_name, file_start_offset, header) size = unpack('>I', self.filereader) def yield_all_changesets(self): """Gets commit data from the bundle. yields: (dict, dict) of (read_chunk_header output, extract_commit_metadata output) """ self.seek_changelog() for header, data, *_ in self.yield_group_objects(): changeset = self.extract_commit_metadata(data) yield (header, changeset) def yield_all_manifest_deltas(self, cache_hints=None): """Gets manifest data from the bundle. 
In order to process the manifests in a reasonable amount of time, we want to use only the deltas and not the entire manifest at each change, because if we're processing them in sequential order (we are) then we already have the previous state so we only need the changes. args: cache_hints: see build_cache_hints method yields: (dict, dict, dict) of (read_chunk_header output, extract_manifest_elements output on added/modified files, extract_manifest_elements on removed files) """ self.seek_manifests() for header, data, added, removed in self.yield_group_objects( cache_hints=cache_hints ): added = self.extract_manifest_elements(added) removed = self.extract_manifest_elements(removed) yield (header, added, removed) def build_manifest_hints(self): """Just a minor abstraction shortcut for the build_cache_hints method. returns: see build_cache_hints method """ self.seek_manifests() return self.build_cache_hints() def build_cache_hints(self): """The SelectiveCache class that we use in building nodes can accept a set of key counters that makes its memory usage much more efficient. returns: dict of key=a node id, value=the number of times we will need data from that node when building subsequent nodes """ cur_pos = self.filereader.tell() hints = OrderedDict() prev_node = None for header in self.skim_headers(): basenode = header['basenode'] if (basenode != self.NAUGHT_NODE) and (basenode != prev_node): hints[basenode] = hints.get(basenode, 0) + 1 prev_node = header['node'] self.filereader.seek(cur_pos) return hints def extract_manifest_elements(self, data): """Parses data that looks like a manifest. In practice we only pass in the bits extracted from the application of a manifest delta describing which files were added/modified or which ones were removed. args: data: either a string or a list of strings that, when joined, embodies the composition of a manifest. This takes the form of repetitions of (without the brackets): b'\x00[flag]\n' ...repeat... where [flag] may or may not be there depending on whether the file is specially flagged as executable or something returns: dict of key=file_path, value=(file_node, permissions) where permissions is given according to the flag that optionally exists in the data """ elements = {} if isinstance(data, str): data = data.split(b'\n') else: data = itertools.chain.from_iterable( [chunk.split(b'\n') for chunk in data] ) for line in data: if line != b'': f = line.split(b'\x00') node = f[1] flag_bytes = node[40:] elements[f[0]] = ( unhexlify(node[:40]), b'l' in flag_bytes, b'755' if (b'x' in flag_bytes) else b'644' ) return elements diff --git a/swh/loader/mercurial/chunked_reader.py b/swh/loader/mercurial/chunked_reader.py index c2d1d0f..d5e953d 100644 --- a/swh/loader/mercurial/chunked_reader.py +++ b/swh/loader/mercurial/chunked_reader.py @@ -1,106 +1,107 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import struct class ChunkedFileReader(object): """A binary stream reader that gives seamless read access to Mercurial's bundle2 HG20 format which is partitioned for some reason at the file level into chunks of [4Bytes:, Bytes:] as if it were encoding transport packets. 
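    Schematically, with the default '>I' size format, the on-disk layout is
    (a rough sketch):

        [4-byte chunk size][chunk payload][4-byte chunk size][chunk payload]...

    and this reader hides the chunk boundaries so that read() and seek()
    behave as if the payload were one continuous stream.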
args: file: rb file handle pre-aligned to the start of the chunked portion size_unpack_fmt: struct format string for unpacking the next chunk size """ def __init__(self, file, size_unpack_fmt='>I'): self._file = file self._size_pattern = size_unpack_fmt self._size_bytes = struct.calcsize(size_unpack_fmt) self._bytes_per_chunk = self._chunk_size(True) self._chunk_bytes_left = self._bytes_per_chunk self._offset = self._file.tell() # find the file size self._file.seek(0, 2) # seek to end self._size = self._file.tell() self._file.seek(self._offset, 0) # seek back to original position def _chunk_size(self, first_time=False): """Unpack the next bytes from the file to get the next file chunk size. """ size = struct.unpack(self._size_pattern, self._file.read(self._size_bytes))[0] return size def size(self): """Returns the file size in bytes. """ return self._size def read(self, bytes_to_read): """Return N bytes from the file as a single block. args: bytes_to_read: int number of bytes of content """ return b''.join(self.read_iterator(bytes_to_read)) def read_iterator(self, bytes_to_read): """Return a generator that yields N bytes from the file one file chunk at a time. args: bytes_to_read: int number of bytes of content """ while bytes_to_read > self._chunk_bytes_left: yield self._file.read(self._chunk_bytes_left) bytes_to_read -= self._chunk_bytes_left self._chunk_bytes_left = self._chunk_size() self._chunk_bytes_left -= bytes_to_read yield self._file.read(bytes_to_read) def seek(self, new_pos=None, from_current=False): """Wraps the underlying file seek, additionally updating the chunk_bytes_left counter appropriately so that we can start reading from the new location. args: new_pos: new cursor byte position from_current: if True, it treats new_pos as an offset from the current cursor position, bypassing any chunk boundaries as if they weren't there. This should give the same end position as a read except without the reading data part. """ if from_current: new_pos = new_pos or 0 # None -> current if new_pos <= self._chunk_bytes_left: new_pos += self._file.tell() else: new_pos += ( self._file.tell() + self._size_bytes + ( ( (new_pos - self._chunk_bytes_left - 1) # aliasing // self._bytes_per_chunk ) * self._size_bytes ) ) else: new_pos = new_pos or self._offset # None -> start position self._chunk_bytes_left = self._bytes_per_chunk - ( (new_pos - self._offset) % (self._bytes_per_chunk + self._size_bytes) ) self._file.seek(new_pos) def __getattr__(self, item): """Forward other calls to the underlying file object. """ return getattr(self._file, item) diff --git a/swh/loader/mercurial/objects.py b/swh/loader/mercurial/objects.py index 8e1a9c0..347b38c 100644 --- a/swh/loader/mercurial/objects.py +++ b/swh/loader/mercurial/objects.py @@ -1,398 +1,398 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """This document contains various helper classes used in converting Mercurial bundle files into SWH Contents, Directories, etc. """ import binascii import copy import os import sys -from collections import OrderedDict +from collections import OrderedDict from sqlitedict import SqliteDict from swh.model import identifiers OS_PATH_SEP = os.path.sep.encode('utf-8') class SimpleBlob: """Stores basic metadata of a blob object.when constructing deep trees from commit file manifests. 
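    For instance (illustrative values only),
    SimpleBlob(blob_hash, is_symlink=False, file_perms=b'644') records the
    mode as 0o100644, or 0o120644 if is_symlink is True.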
args: file_hash: unique hash of the file contents is_symlink: (bool) is this file a symlink? file_perms: (string) 3 digit permission code as a string or bytestring, e.g. '755' or b'755' """ kind = 'file' def __init__(self, file_hash, is_symlink, file_perms): self.hash = file_hash self.perms = 0o100000 + int(file_perms, 8) if is_symlink: self.perms += 0o020000 def __str__(self): return ('SimpleBlob: ' + str(self.hash) + ' -- ' + str(self.perms)) def __eq__(self, other): return ((self.perms == other.perms) and (self.hash == other.hash)) def size(self): """Return the size in byte.""" return sys.getsizeof(self) + sys.getsizeof(self.__dict__) class SimpleTree(dict): """ Stores data for a nested directory object. Uses shallow cloning to stay compact after forking and change monitoring for efficient re-hashing. """ kind = 'dir' perms = 0o040000 def __init__(self): self.hash = None self._size = None def __eq__(self, other): return ((self.hash == other.hash) and (self.items() == other.items())) def _new_tree_node(self, path): """Deeply nests SimpleTrees according to a given subdirectory path and returns a reference to the deepest one. args: path: bytestring containing a relative path from self to a deep subdirectory. e.g. b'foodir/bardir/bazdir' returns: the new node """ node = self for d in path.split(OS_PATH_SEP): if node.get(d): if node[d].hash is not None: node[d] = copy.copy(node[d]) node[d].hash = None node[d]._size = None else: node[d] = SimpleTree() node = node[d] return node def remove_tree_node_for_path(self, path): """Deletes a SimpleBlob or SimpleTree from inside nested SimpleTrees according to the given relative file path, and then recursively removes any newly depopulated SimpleTrees. It keeps the old history by doing a shallow clone before any change. args: path: bytestring containing a relative path from self to a nested file or directory. e.g. b'foodir/bardir/bazdir/quxfile.txt' returns: the new root node """ node = self if node.hash is not None: node = copy.copy(node) node.hash = None node._size = None first, sep, rest = path.partition(OS_PATH_SEP) if rest: node[first] = node[first].remove_tree_node_for_path(rest) if len(node[first]) == 0: del node[first] else: del node[first] return node def add_blob(self, file_path, file_hash, is_symlink, file_perms): """Shallow clones the root node and then deeply nests a SimpleBlob inside nested SimpleTrees according to the given file path, shallow cloning all all intermediate nodes and marking them as changed and in need of new hashes. args: file_path: bytestring containing the relative path from self to a nested file file_hash: primary identifying hash computed from the blob contents is_symlink: True/False whether this item is a symbolic link file_perms: int or string representation of file permissions returns: the new root node """ root = self if root.hash is not None: root = copy.copy(root) root.hash = None root._size = None node = root fdir, fbase = os.path.split(file_path) if fdir: node = root._new_tree_node(fdir) node[fbase] = SimpleBlob(file_hash, is_symlink, file_perms) return root def yield_SWH_directories(self): """Converts nested SimpleTrees into a stream of SWH Directories. 
yields: an SWH Directory for every node in the tree """ for k, v in sorted(self.items()): if isinstance(v, SimpleTree): yield from v.yield_SWH_directories() yield { 'id': self.hash, 'entries': [ { 'name': k, 'perms': v.perms, 'type': v.kind, 'target': v.hash } for k, v in sorted(self.items()) ] } def hash_changed(self, new_dirs=None): """Computes and sets primary indentifier hashes for unhashed subtrees. args: new_dirs (optional): an empty list to be populated with the SWH Directories for all of the new (not previously hashed) nodes returns: the top level hash of the whole tree """ if self.hash is None: directory = { 'entries': [ { 'name': k, 'perms': v.perms, 'type': v.kind, 'target': (v.hash if v.hash is not None else v.hash_changed(new_dirs)) } for k, v in sorted(self.items()) ] } self.hash = binascii.unhexlify( identifiers.directory_identifier(directory) ) directory['id'] = self.hash if new_dirs is not None: new_dirs.append(directory) return self.hash def flatten(self, _curpath=None, _files=None): """Converts nested sub-SimpleTrees and SimpleBlobs into a list of file paths. Useful for counting the number of files in a manifest. returns: a flat list of all of the contained file paths """ _curpath = _curpath or b'' _files = _files or {} for k, v in sorted(self.items()): p = os.path.join(_curpath, k) if isinstance(v, SimpleBlob): _files[p] = (v.hash, v.perms) else: v.flatten(p, _files) return _files def size(self): """Return the (approximate?) memory utilization in bytes of the nested structure. """ if self._size is None: self._size = ( sys.getsizeof(self) + sys.getsizeof(self.__dict__) + sum([ sys.getsizeof(k)+v.size() for k, v in self.items() ]) ) return self._size class SelectiveCache(OrderedDict): """Special cache for storing past data upon which new data is known to be dependent. Optional hinting of how many instances of which keys will be needed down the line makes utilization more efficient. And, because the distance between related data can be arbitrarily long and the data fragments can be arbitrarily large, a disk-based secondary storage is used if the primary RAM-based storage area is filled to the designated capacity. Storage is occupied in three phases: 1) The most recent key/value pair is always held, regardless of other factors, until the next entry replaces it. 2) Stored key/value pairs are pushed into a randomly accessible expanding buffer in memory with a stored size function, maximum size value, and special hinting about which keys to store for how long optionally declared at instantiation. 3) The in-memory buffer pickles into a randomly accessible disk-backed secondary buffer when it becomes full. Occupied space is calculated by default as whatever the len() function returns on the values being stored. This can be changed by passing in a new size_function at instantiation. The cache_hints parameter is a dict of key/int pairs recording how many subsequent fetches that particular key's value should stay in storage for before being erased. If you provide a set of hints and then try to store a key that is not in that set of hints, the cache will store it only while it is the most recent entry, and will bypass storage phases 2 and 3. 
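    Example usage (illustrative only; keys and values are whatever the
    caller chooses, here a bytes node id and bytes revision data):

        cache = SelectiveCache(cache_hints={b'node-a': 2})
        cache.store(b'node-a', b'full revision text')
        data = cache.fetch(b'node-a')  # returns the data, decrements the hint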
""" DEFAULT_SIZE = 800*1024*1024 # bytes or whatever def __init__(self, max_size=None, cache_hints=None, size_function=None): """args: max_size: integer value indicating the maximum size of the part of storage held in memory cache_hints: dict of key/int pairs as described in the class description size_function: callback function that accepts one parameter and returns one int, which should probably be the calculated size of the parameter """ self._max_size = max_size or SelectiveCache.DEFAULT_SIZE self._disk = None if size_function is None: self._size_function = sys.getsizeof else: self._size_function = size_function self._latest = None self._cache_size = 0 self._cache_hints = copy.copy(cache_hints) or None def store(self, key, data): """Primary method for putting data into the cache. args: key: any hashable value data: any python object (preferably one that is measurable) """ self._latest = (key, data) if (self._cache_hints is not None) and (key not in self._cache_hints): return # cache the completed data... self._cache_size += self._size_function(data) + 53 # ...but limit memory expenditure for the cache by offloading to disk should_commit = False while ( self._cache_size > self._max_size and len(self) > 0 ): should_commit = True k, v = self.popitem(last=False) self._cache_size -= self._size_function(v) - 53 self._diskstore(k, v) if should_commit: self._disk.commit(blocking=False) self[key] = data def _diskstore(self, key, value): if self._disk is None: self._disk = SqliteDict(autocommit=False, journal_mode='OFF') self._disk[key] = value def has(self, key): """Tests whether the data for the provided key is being stored. args: key: the key of the data whose storage membership property you wish to discover returns: True or False """ return ( (self._latest and self._latest[0] == key) or (key in self) or (self._disk and (key in self._disk)) ) def fetch(self, key): """Pulls a value out of storage and decrements the hint counter for the given key. args: key: the key of the data that you want to retrieve returns: the retrieved value or None """ retval = None if self._latest and self._latest[0] == key: retval = self._latest[1] if retval is None: retval = self.get(key) if (retval is None) and self._disk: self._disk.commit(blocking=False) retval = self._disk.get(key) or None self.dereference(key) return retval def dereference(self, key): """Remove one instance of expected future retrieval of the data for the given key. This is called automatically by fetch requests that aren't satisfied by phase 1 of storage. 
args: the key of the data for which the future retrievals hint is to be decremented """ newref = self._cache_hints and self._cache_hints.get(key) if newref: newref -= 1 if newref == 0: del self._cache_hints[key] if key in self: item = self[key] self._cache_size -= self._size_function(item) del self[key] else: if self._disk: del self._disk[key] else: self._cache_hints[key] = newref def keys(self): yield from self.keys() if self._disk: yield from self._disk.keys() def values(self): yield from self.values() if self._disk: yield from self._disk.values() def items(self): yield from self.items() if self._disk: yield from self._disk.items() diff --git a/swh/loader/mercurial/slow_loader.py b/swh/loader/mercurial/slow_loader.py index 81b5d2d..e9b8ab5 100644 --- a/swh/loader/mercurial/slow_loader.py +++ b/swh/loader/mercurial/slow_loader.py @@ -1,416 +1,415 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING WARNING WARNING WARNING # hglib is too slow to be super useful. Unfortunately it's also the only # python3 library for mercurial as of this writing. - Avi import datetime -import os - import hglib +import os from swh.model import identifiers from . import base, converters from .archive_extract import tmp_extract # TODO: What should this be? # swh-model/identifiers.py:identifier_to_bytes has a restrictive length check # in it which prevents using blake2 with hashutil.hash_to_hex ALGO = 'sha1_git' OS_PATH_SEP = os.path.sep.encode('utf-8') class SimpleBlob: """ Stores basic metadata for a blob object. """ kind = 'file' def __init__(self, file_hash, file_mode): self.hash = file_hash if not isinstance(file_mode, int): self.mode = 0o100000 + int(file_mode, 8) else: self.mode = file_mode class SimpleTree(dict): """ Stores metadata for a nested 'tree'-like object. """ kind = 'dir' mode = 0o040000 def add_tree_node_for_path(self, path): """Deeply nests SimpleTrees according to a directory path and returns a cursor to the deepest one""" node = self for d in path.split(OS_PATH_SEP): node = node.setdefault(d, SimpleTree()) return node def remove_tree_node_for_path(self, path): """Deletes a SimpleBlob from inside nested SimpleTrees according to the given file path""" first, sep, rest = path.partition(OS_PATH_SEP) if rest: self[first].remove_tree_node_for_path(rest) if not self.get(first): del self[first] else: del self[first] def add_blob(self, file_path, file_hash, file_mode): """Deeply nests a SimpleBlob inside nested SimpleTrees according to the given file path""" fdir = os.path.dirname(file_path) fbase = os.path.basename(file_path) if fdir: node = self.add_tree_node_for_path(fdir) else: node = self node[fbase] = SimpleBlob(file_hash, file_mode) class HgLoader(base.BaseLoader): """Load a mercurial repository from a directory. 
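    Typical use, mirroring the __main__ block at the bottom of this file
    (load() itself comes from the base loader):

        loader = HgLoader()
        loader.load(origin_url, directory, fetch_date)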
""" CONFIG_BASE_FILENAME = 'loader/hg-loader' def prepare(self, origin_url, directory, fetch_date): """see base.BaseLoader.prepare""" self.origin_url = origin_url self.repo = hglib.open(directory) self.fetch_date = fetch_date self.node_to_blob_hash = {} self.blob_hash_to_file_rev = {} self.commit_trees = {} self.unique_trees = {} self.revisions = {} def get_origin(self): """Get the origin that is currently being loaded in format suitable for swh.storage""" return { 'type': 'hg', 'url': self.origin_url } def fetch_data(self): """Fetch the data from the data source""" pass def has_contents(self): """Checks whether we need to load contents""" # if we have any revisions, then obviously we have contents. return self.has_revisions() def iter_changelog(self): """Iterate over the repository log""" yield from self.repo.log('0:tip', removed=True) def get_node_file_if_new(self, f, rev, node_hash): """Load a blob from disk""" # Fast if the node hash is already cached. Somehow this shortcuts a # meaningful but not huge percentage of the loads for a repository. if node_hash not in self.node_to_blob_hash: file_path = os.path.join(self.repo.root(), f) data = self.repo.cat([file_path], rev) blob_hash = identifiers.content_identifier( {'data': data} )[ALGO] self.node_to_blob_hash[node_hash] = blob_hash if blob_hash not in self.blob_hash_to_file_rev: # new blob self.blob_hash_to_file_rev[blob_hash] = (file_path, rev) return blob_hash, data return self.node_to_blob_hash[node_hash], None def get_content_ids(self): """Get all the contents, but trim away the actual data""" self.node_to_blob_hash = {} self.blob_hash_to_file_rev = {} self.num_contents = 0 for li in self.iter_changelog(): c = self.repo[li] rev = c.rev() manifest = c.manifest() for f in c.added() + c.modified(): node_hash = manifest[f] blob_hash, data = self.get_node_file_if_new(f, rev, node_hash) if data is not None: # new blob self.num_contents += 1 yield converters.data_to_content_id(data) def get_contents(self): """Get the contents that need to be loaded""" # This method unfortunately loads and hashes the blobs twice. 
max_content_size = self.config['content_size_limit'] missing_contents = set( self.storage.content_missing( self.get_content_ids(), ALGO ) ) for oid in missing_contents: file_path, rev = self.blob_hash_to_file_rev[oid] data = self.repo.cat([file_path], rev) yield converters.blob_to_content_dict( data, max_content_size, self.log, self.origin_id ) def has_directories(self): """Checks whether we need to load directories""" # if we have any revs, we must also have dirs return self.has_revisions() def get_directories(self): """Get the directories that need to be loaded""" missing_dirs = set(self.storage.directory_missing( sorted(self.unique_trees.keys()) )) for dir_hash in missing_dirs: yield self.unique_trees[dir_hash] def has_revisions(self): """Checks whether we need to load revisions""" self.num_revisions = int(self.repo.tip()[0]) + 1 return self.num_revisions > 0 def update_tree_from_rev(self, tree, rev, only_these_files=None): """Iterates over changes in a revision and adds corresponding SimpleBlobs to a SimpleTree""" if rev >= 0: manifest = {k[4]: k for k in self.repo.manifest(rev=rev)} loop_keys = only_these_files or manifest.keys() for f in loop_keys: node_hash = manifest[f][0] file_mode = manifest[f][1] file_hash, _ = self.get_node_file_if_new(f, rev, node_hash) tree.add_blob(f, file_hash, file_mode) return tree def reconstruct_tree(self, directory): """Converts a flat directory into nested SimpleTrees.""" # This method exists because the code was already written to use # SimpleTree before then reducing memory use and converting to the # canonical format. A refactor using lookups instead of nesting could # obviate the need. new_tree = SimpleTree() for entry in directory['entries']: tgt = entry['target'] perms = entry['perms'] name = entry['name'] if tgt in self.unique_trees: # subtree new_tree[name] = self.reconstruct_tree(self.unique_trees[tgt]) else: # blob new_tree[name] = SimpleBlob(tgt, perms) new_tree.hash = directory['id'] return new_tree def collapse_tree(self, tree): """Converts nested SimpleTrees into multiple flat directories.""" # This method exists because the code was already written to use # SimpleTree before then reducing memory use and converting to the # canonical format. A refactor using lookups instead of nesting could # obviate the need. directory = { 'entries': [ { 'name': k, 'perms': v.mode, 'type': v.kind, 'target': (isinstance(v, SimpleBlob) and v.hash or self.collapse_tree(v)) } for k, v in tree.items() ] } tree.hash = identifiers.directory_identifier(directory) directory['id'] = tree.hash self.unique_trees[tree.hash] = directory return tree.hash def get_revision_ids(self): """Get the revisions that need to be loaded""" self.unique_trees = {} commit_tree = None for li in self.iter_changelog(): c = self.repo[li[1]] rev = c.rev() # start from the parent state p1 = c.p1().rev() if p1 in self.commit_trees: if p1 != rev-1: # Most of the time, a revision will inherit from the # previous one. In those cases we can reuse commit_tree, # otherwise build a new one here. 
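# p1 is not the immediately preceding revision, so rebuild the working
# tree from p1's stored (collapsed) directory instead of reusing the
# current commit_tree.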
parent_tree = self.unique_trees[self.commit_trees[p1]] commit_tree = self.reconstruct_tree(parent_tree) else: commit_tree = self.update_tree_from_rev(SimpleTree(), p1) # remove whatever is removed for f in c.removed(): commit_tree.remove_tree_node_for_path(f) # update whatever is updated self.update_tree_from_rev(commit_tree, rev, c.added()+c.modified()) self.commit_trees[rev] = self.collapse_tree(commit_tree) date_dict = identifiers.normalize_timestamp( int(c.date().timestamp()) ) author_dict = converters.parse_author(c.author()) revision = { 'author': author_dict, 'date': date_dict, 'committer': author_dict, 'committer_date': date_dict, 'type': 'hg', 'directory': commit_tree.hash, 'message': c.description(), 'metadata': { 'extra_headers': [ ['phase', c.phase()], ['rev', rev], ['hidden', c.hidden()] ] }, 'synthetic': False, 'parents': [ self.revisions[p.node()]['id'] for p in c.parents() if p.rev() >= 0 ] } revision['id'] = identifiers.revision_identifier(revision) self.revisions[c.node()] = revision for n, r in self.revisions.items(): yield {'node': n, 'id': r['id']} def get_revisions(self): """Get the revision identifiers from the repository""" revs = {r['id']: r['node'] for r in self.get_revision_ids()} missing_revs = set(self.storage.revision_missing(revs.keys())) for r in missing_revs: yield self.revisions[revs[r]] def has_releases(self): """Checks whether we need to load releases""" self.num_releases = len([t for t in self.repo.tags() if not t[3]]) return self.num_releases > 0 def get_releases(self): """Get the releases that need to be loaded""" releases = {} for t in self.repo.tags(): islocal = t[3] name = t[0] if (name != b'tip' and not islocal): short_hash = t[2] target = self.revisions[self.repo[short_hash].node()]['id'] release = { 'name': name, 'target': target, 'target_type': 'revision', 'message': None, 'metadata': None, 'synthetic': False, 'author': None, 'date': None } id_hash = identifiers.release_identifier(release) release['id'] = id_hash releases[id_hash] = release missing_rels = set(self.storage.release_missing( sorted(releases.keys()) )) yield from (releases[r] for r in missing_rels) def has_occurrences(self): """Checks whether we need to load occurrences""" self.num_occurrences = len( self.repo.tags() + self.repo.branches() + self.repo.bookmarks()[0] ) return self.num_occurrences > 0 def get_occurrences(self): """Get the occurrences that need to be loaded""" for t in ( self.repo.tags() + self.repo.branches() + self.repo.bookmarks()[0] ): name = t[0] short_hash = t[2] target = self.revisions[self.repo[short_hash].node()]['id'] yield { 'branch': name, 'origin': self.origin_id, 'target': target, 'target_type': 'revision', 'visit': self.visit, } def get_fetch_history_result(self): """Return the data to store in fetch_history for the current loader""" return { 'contents': self.num_contents, 'directories': len(self.unique_trees), 'revisions': self.num_revisions, 'releases': self.num_releases, 'occurrences': self.num_occurrences, } def save_data(self): """We already have the data locally, no need to save it""" pass def eventful(self): """Whether the load was eventful""" return True class HgLoaderFromArchive(HgLoader): """Load an HG repository from a compressed archive. 
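    Usage matches HgLoader, except that prepare() takes the path of a
    compressed archive; the archive is unpacked into a temporary directory
    with tmp_extract() and then handed to HgLoader.prepare().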
""" def prepare(self, origin_url, archive_path, fetch_date): tmpdir = tmp_extract(archive_path, tmpdir_prefix='swh.loader.hg.', log=self.log, source=origin_url) super().prepare(origin_url, tmpdir.name, fetch_date) if __name__ == '__main__': import logging import sys logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) loader = HgLoader() origin_url = sys.argv[1] directory = sys.argv[2] fetch_date = datetime.datetime.now(tz=datetime.timezone.utc) print(loader.load(origin_url, directory, fetch_date))