diff --git a/swh/loader/mercurial/archive_extract.py b/swh/loader/mercurial/archive_extract.py
index ed5d71e..42ca6a6 100644
--- a/swh/loader/mercurial/archive_extract.py
+++ b/swh/loader/mercurial/archive_extract.py
@@ -1,45 +1,44 @@
 # Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import tempfile

 import patoolib


 def tmp_extract(archive, prefix=None, log=None, source=None):
     """Extract an archive to a temporary location with optional logs.

     Args:
         archive (string): Absolute path of the archive to be extracted
         prefix (string): Optional modifier to the temporary storage
             directory name. (I guess in case something goes wrong and you
             want to go look?)
         log (python logging instance): Optional for recording extractions.
         source (string): Optional source URL of the archive for adding to
             log messages.

     Returns:
         A context manager for a temporary directory that automatically
         removes itself. See: help(tempfile.TemporaryDirectory)

     """
     archive_base = os.path.basename(archive)
     if archive_base[0] == '.':
         package = '.' + archive_base.split('.')[1]
     else:
         package = archive_base.split('.')[0]

-    tmpdir = tempfile.TemporaryDirectory(prefix=prefix)
+    tmpdir = tempfile.mkdtemp(prefix=prefix)
     patoolib.extract_archive(archive, interactive=False, outdir=tmpdir)
-
-    repo_path = os.path.join(tmpdir.name, package)
+    repo_path = os.path.join(tmpdir, package)

     if log is not None:
         logstr = ''
         if source is not None:
             logstr = 'From %s - ' % source
-        log.info(logstr + 'Uncompressing archive %s at %s' % (
-            archive_base, repo_path))
+        log.info('%sUncompressing archive %s at %s' % (
+            logstr, archive_base, repo_path))

     return tmpdir
diff --git a/swh/loader/mercurial/bundle20_loader.py b/swh/loader/mercurial/bundle20_loader.py
index 7ba54bc..ed4e57b 100644
--- a/swh/loader/mercurial/bundle20_loader.py
+++ b/swh/loader/mercurial/bundle20_loader.py
@@ -1,313 +1,370 @@
 # Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 """This document contains a SWH loader for ingesting repository data
 from Mercurial version 2 bundle files.
 """

 # NOTE: The code here does expensive work twice in places because of the
 # intermediate need to check for what is missing before sending to the database
 # and the desire to not juggle very large amounts of data.

 # TODO: Decide whether to also serialize to disk and read back more quickly
 # from there. Maybe only for very large repos and fast drives.
 # - Avi

 import hglib
 import os

 from shutil import rmtree
 from tempfile import mkdtemp

 from swh.model import hashutil, identifiers
 from swh.loader.core.loader import SWHStatelessLoader
 from swh.loader.core.converters import content_for_storage

 from . import converters
+from .archive_extract import tmp_extract
 from .bundle20_reader import Bundle20Reader
 from .converters import PRIMARY_ALGO as ALGO
 from .objects import SelectiveCache, SimpleTree


 class HgBundle20Loader(SWHStatelessLoader):
+    """Mercurial loader able to deal with remote or local repositories.
+
+    """
     CONFIG_BASE_FILENAME = 'loader/hg'

     ADDITIONAL_CONFIG = {
         'bundle_filename': ('str', 'HG20_none_bundle'),
     }

-    def __init__(self):
-        super().__init__(logging_class='swh.loader.mercurial.Bundle20Loader')
+    def __init__(self, logging_class='swh.loader.mercurial.Bundle20Loader'):
+        super().__init__(logging_class=logging_class)
         self.content_max_size_limit = self.config['content_size_limit']
         self.bundle_filename = self.config['bundle_filename']
         self.hg = None
         self.tags = []
-        self.working_directory = mkdtemp(suffix='.tmp',
-                                         prefix='swh.loader.mercurial.',
-                                         dir='/tmp')

     def cleanup(self):
         """Clean temporary working directory

         """
-        if os.path.exists(self.working_directory):
+        if self.bundle_path and os.path.exists(self.bundle_path):
+            self.log.debug('Cleaning up working bundle %s' % self.bundle_path)
+            os.unlink(self.bundle_path)
+        if self.working_directory and os.path.exists(self.working_directory):
+            self.log.debug('Cleaning up working directory %s' % (
+                self.working_directory, ))
             rmtree(self.working_directory)

-    def prepare(self, origin_url, directory, visit_date):
-        """see base.BaseLoader.prepare"""
-        self.origin_url = origin_url
-        self.origin = self.get_origin()
-        self.visit_date = visit_date
-
-        wd = os.path.join(self.working_directory, directory)
-        os.makedirs(wd, exist_ok=True)
-        self.hgdir = wd
+    def prepare(self, origin_url, visit_date, directory=None):
+        """Prepare the necessary steps to load an actual remote or local
+        repository.

-        bundle_path = os.path.join(self.hgdir, self.bundle_filename)
-        self.log.debug('Cloning %s to %s' % (self.origin_url, self.hgdir))
-        hglib.clone(source=self.origin_url, dest=self.hgdir)
+        To load a local repository, pass a path to it through the
+        optional directory parameter.

-        self.log.debug('Bundling at %s' % bundle_path)
-        with hglib.open(directory) as repo:
-            repo.bundle(bytes(bundle_path, 'utf-8'),
-                        all=True,
-                        type=b'none')
+        To load a remote repository, leave the optional directory
+        parameter as None.
- self.br = Bundle20Reader(bundle_path) + """ + self.origin_url = origin_url + self.origin = self.get_origin() + self.visit_date = visit_date + self.working_directory = None + self.bundle_path = None + + try: + if not directory: # remote repository + self.working_directory = mkdtemp( + suffix='.tmp', + prefix='swh.loader.mercurial.', + dir='/tmp') + os.makedirs(self.working_directory, exist_ok=True) + self.hgdir = self.working_directory + + self.log.debug('Cloning %s to %s' % ( + self.origin_url, self.hgdir)) + hglib.clone(source=self.origin_url, dest=self.hgdir) + else: # local repository + self.working_directory = None + self.hgdir = directory + + self.bundle_path = os.path.join(self.hgdir, self.bundle_filename) + self.log.debug('Bundling at %s' % self.bundle_path) + with hglib.open(self.hgdir) as repo: + repo.bundle(bytes(self.bundle_path, 'utf-8'), + all=True, + type=b'none') + except: + self.cleanup() + raise + + self.br = Bundle20Reader(self.bundle_path) def get_origin(self): """Get the origin that is currently being loaded in format suitable for swh.storage.""" return { 'type': 'hg', 'url': self.origin_url } def fetch_data(self): """Fetch the data from the data source.""" pass def get_contents(self): """Get the contents that need to be loaded.""" self.file_node_to_hash = {} missing_contents = set() hash_to_info = {} self.num_contents = 0 contents = {} for blob, node_info in self.br.yield_all_blobs(): self.num_contents += 1 file_name = node_info[0] header = node_info[2] content = hashutil.hash_data(blob, with_length=True) content['data'] = blob blob_hash = content[ALGO] self.file_node_to_hash[header['node']] = blob_hash hash_to_info[blob_hash] = node_info missing_contents.add(blob_hash) contents[blob_hash] = content if file_name == b'.hgtags': # https://www.mercurial-scm.org/wiki/GitConcepts#Tag_model # overwrite until the last one self.tags = (t for t in blob.split(b'\n') if t != b'') missing_contents = set( self.storage.content_missing( (contents[h] for h in missing_contents), key_hash=ALGO ) ) # Clusters needed blobs by file offset and then only fetches the # groups at the needed offsets. focs = {} # "file/offset/contents" for blob_hash in missing_contents: _, file_offset, header = hash_to_info[blob_hash] focs.setdefault(file_offset, {}) focs[file_offset][header['node']] = blob_hash hash_to_info = None for offset, node_hashes in sorted(focs.items()): for header, data, *_ in self.br.yield_group_objects( group_offset=offset ): node = header['node'] if node in node_hashes: h = node_hashes[node] yield content_for_storage( contents[h], log=self.log, max_content_size=self.content_max_size_limit, origin_id=self.origin_id ) def load_directories(self): """This is where the work is done to convert manifest deltas from the repository bundle into SWH directories. 
""" self.mnode_to_tree_id = {} base_manifests = self.br.build_manifest_hints() def tree_size(t): return t.size() self.trees = SelectiveCache(cache_hints=base_manifests, size_function=tree_size) tree = SimpleTree() for header, added, removed in self.br.yield_all_manifest_deltas( base_manifests ): node = header['node'] basenode = header['basenode'] tree = self.trees.fetch(basenode) or tree # working tree for path in removed.keys(): tree = tree.remove_tree_node_for_path(path) for path, info in added.items(): file_node, is_symlink, perms_code = info tree = tree.add_blob( path, self.file_node_to_hash[file_node], is_symlink, perms_code ) new_dirs = [] self.mnode_to_tree_id[node] = tree.hash_changed(new_dirs) self.trees.store(node, tree) yield header, tree, new_dirs def get_directories(self): """Get the directories that need to be loaded.""" missing_dirs = [] self.num_directories = 0 for header, tree, new_dirs in self.load_directories(): for d in new_dirs: self.num_directories += 1 missing_dirs.append(d['id']) missing_dirs = set( self.storage.directory_missing(missing_dirs) ) for header, tree, new_dirs in self.load_directories(): for d in new_dirs: if d['id'] in missing_dirs: yield d def get_revisions(self): """Get the revisions that need to be loaded.""" self.branches = {} revisions = {} self.num_revisions = 0 for header, commit in self.br.yield_all_changesets(): self.num_revisions += 1 date_dict = identifiers.normalize_timestamp( int(commit['time'].timestamp()) ) author_dict = converters.parse_author(commit['user']) if commit['manifest'] == Bundle20Reader.NAUGHT_NODE: directory_id = SimpleTree().hash_changed() else: directory_id = self.mnode_to_tree_id[commit['manifest']] extra_meta = [] extra = commit.get('extra') if extra: for e in extra.split(b'\x00'): k, v = e.split(b':', 1) k = k.decode('utf-8') extra_meta.append([k, v]) if k == 'branch': # needed for Occurrences self.branches[v] = header['node'] revision = { 'author': author_dict, 'date': date_dict, 'committer': author_dict, 'committer_date': date_dict, 'type': 'hg', 'directory': directory_id, 'message': commit['message'], 'metadata': { 'node': hashutil.hash_to_hex(header['node']), 'extra_headers': [ ['time_offset_seconds', str(commit['time_offset_seconds']).encode('utf-8')], ] + extra_meta }, 'synthetic': False, 'parents': [ header['p1'], header['p2'] ] } revision['id'] = hashutil.hash_to_bytes( identifiers.revision_identifier(revision)) revisions[revision['id']] = revision missing_revs = revisions.keys() missing_revs = set( self.storage.revision_missing(list(missing_revs)) ) for r in missing_revs: yield revisions[r] self.mnode_to_tree_id = None def get_releases(self): """Get the releases that need to be loaded.""" self.num_releases = 0 releases = {} missing_releases = [] for t in self.tags: self.num_releases += 1 node, name = t.split(b' ') release = { 'name': name, 'target': hashutil.hash_to_bytes(node.decode()), 'target_type': 'revision', 'message': None, 'metadata': None, 'synthetic': False, 'author': {'name': None, 'email': None, 'fullname': b''}, 'date': None } id_hash = hashutil.hash_to_bytes( identifiers.release_identifier(release)) release['id'] = id_hash missing_releases.append(id_hash) releases[id_hash] = release missing_releases = set(self.storage.release_missing(missing_releases)) for _id in missing_releases: yield releases[_id] def get_occurrences(self): """Get the occurrences that need to be loaded.""" self.num_occurrences = 0 for name, target in self.branches.items(): self.num_occurrences += 1 yield { 'branch': name, 
'origin': self.origin_id, 'target': target, 'target_type': 'revision', 'visit': self.visit, } def get_fetch_history_result(self): """Return the data to store in fetch_history.""" return { 'contents': self.num_contents, 'directories': self.num_directories, 'revisions': self.num_revisions, 'releases': self.num_releases, 'occurrences': self.num_occurrences } + + +class HgArchiveBundle20Loader(HgBundle20Loader): + """Mercurial loader for repository wrapped within archives. + + """ + def __init__(self): + super().__init__( + logging_class='swh.loader.mercurial.HgArchiveBundle20Loader') + + def prepare(self, origin_url, archive_path, visit_date): + self.temp_dir = tmp_extract(archive=archive_path, + prefix='swh.loader.mercurial.', + log=self.log, + source=origin_url) + + repo_name = os.listdir(self.temp_dir)[0] + directory = os.path.join(self.temp_dir, repo_name) + try: + super().prepare(origin_url, visit_date, directory=directory) + except: + self.cleanup() + raise + + def cleanup(self): + if os.path.exists(self.temp_dir): + rmtree(self.temp_dir) + super().cleanup() diff --git a/swh/loader/mercurial/slow_loader.py b/swh/loader/mercurial/slow_loader.py index ec2a3d3..d0f79d5 100644 --- a/swh/loader/mercurial/slow_loader.py +++ b/swh/loader/mercurial/slow_loader.py @@ -1,476 +1,460 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING WARNING WARNING WARNING # hglib is too slow to be super useful. Unfortunately it's also the only # python3 library for mercurial as of this writing. - Avi import datetime import hglib import os from swh.model import identifiers, hashutil from swh.loader.core.loader import SWHStatelessLoader -from .archive_extract import tmp_extract from .converters import parse_author, PRIMARY_ALGO as ALGO OS_PATH_SEP = os.path.sep.encode('utf-8') def data_to_content_id(data): size = len(data) ret = { 'length': size, } ret.update(identifiers.content_identifier({'data': data})) return ret def blob_to_content_dict(data, existing_hashes=None, max_size=None, logger=None): """Convert blob data to a SWH Content. If the blob already has hashes computed, don't recompute them. TODO: This should be unified with similar functions in other places. args: existing_hashes: dict of hash algorithm:value pairs max_size: size over which blobs should be rejected logger: logging class instance returns: A Software Heritage "content". """ existing_hashes = existing_hashes or {} size = len(data) content = { 'length': size, } content.update(existing_hashes) hash_types = list(existing_hashes.keys()) hashes_to_do = hashutil.DEFAULT_ALGORITHMS.difference(hash_types) content.update(hashutil.hash_data(data, algorithms=hashes_to_do)) if max_size and (size > max_size): content.update({ 'status': 'absent', 'reason': 'Content too large', }) if logger: id_hash = hashutil.hash_to_hex(content[ALGO]) logger.info( 'Skipping content %s, too large (%s > %s)' % (id_hash, size, max_size), extra={ 'swh_type': 'loader_content_skip', 'swh_id': id_hash, 'swh_size': size } ) else: content.update({'data': data, 'status': 'visible'}) return content class SimpleBlob: """ Stores basic metadata for a blob object. 
""" kind = 'file' def __init__(self, file_hash, file_mode): self.hash = file_hash if not isinstance(file_mode, int): self.mode = 0o100000 + int(file_mode, 8) else: self.mode = file_mode class SimpleTree(dict): """ Stores metadata for a nested 'tree'-like object. """ kind = 'dir' mode = 0o040000 def add_tree_node_for_path(self, path): """Deeply nests SimpleTrees according to a directory path and returns a cursor to the deepest one""" node = self for d in path.split(OS_PATH_SEP): node = node.setdefault(d, SimpleTree()) return node def remove_tree_node_for_path(self, path): """Deletes a SimpleBlob from inside nested SimpleTrees according to the given file path""" first, sep, rest = path.partition(OS_PATH_SEP) if rest: self[first].remove_tree_node_for_path(rest) if not self.get(first): del self[first] else: del self[first] def add_blob(self, file_path, file_hash, file_mode): """Deeply nests a SimpleBlob inside nested SimpleTrees according to the given file path""" fdir = os.path.dirname(file_path) fbase = os.path.basename(file_path) if fdir: node = self.add_tree_node_for_path(fdir) else: node = self node[fbase] = SimpleBlob(file_hash, file_mode) class HgLoader(SWHStatelessLoader): """Load a mercurial repository from a directory. """ CONFIG_BASE_FILENAME = 'loader/hg' def __init__(self, logging_class='swh.loader.mercurial.HgLoader'): super().__init__(logging_class=logging_class) def prepare(self, origin_url, directory, visit_date): """see base.BaseLoader.prepare""" self.origin_url = origin_url self.repo = hglib.open(directory) self.visit_date = visit_date self.node_to_blob_hash = {} self.blob_hash_to_file_rev = {} self.commit_trees = {} self.unique_trees = {} self.revisions = {} def get_origin(self): """Get the origin that is currently being loaded in format suitable for swh.storage""" return { 'type': 'hg', 'url': self.origin_url } def fetch_data(self): """Fetch the data from the data source""" pass def has_contents(self): """Checks whether we need to load contents""" # if we have any revisions, then obviously we have contents. return self.has_revisions() def iter_changelog(self): """Iterate over the repository log""" yield from self.repo.log('0:tip', removed=True) def get_node_file_if_new(self, f, rev, node_hash): """Load a blob from disk""" # Fast if the node hash is already cached. Somehow this shortcuts a # meaningful but not huge percentage of the loads for a repository. if node_hash not in self.node_to_blob_hash: file_path = os.path.join(self.repo.root(), f) data = self.repo.cat([file_path], rev) blob_hash = identifiers.content_identifier( {'data': data} )[ALGO] self.node_to_blob_hash[node_hash] = blob_hash if blob_hash not in self.blob_hash_to_file_rev: # new blob self.blob_hash_to_file_rev[blob_hash] = (file_path, rev) return blob_hash, data return self.node_to_blob_hash[node_hash], None def get_content_ids(self): """Get all the contents, but trim away the actual data""" self.node_to_blob_hash = {} self.blob_hash_to_file_rev = {} self.num_contents = 0 for li in self.iter_changelog(): c = self.repo[li] rev = c.rev() manifest = c.manifest() for f in c.added() + c.modified(): node_hash = manifest[f] blob_hash, data = self.get_node_file_if_new(f, rev, node_hash) if data is not None: # new blob self.num_contents += 1 yield data_to_content_id(data) def get_contents(self): """Get the contents that need to be loaded""" # This method unfortunately loads and hashes the blobs twice. 
max_content_size = self.config['content_size_limit'] missing_contents = set( self.storage.content_missing( self.get_content_ids(), ALGO ) ) for oid in missing_contents: file_path, rev = self.blob_hash_to_file_rev[oid] data = self.repo.cat([file_path], rev) yield blob_to_content_dict( data, max_size=max_content_size, logger=self.log ) def has_directories(self): """Checks whether we need to load directories""" # if we have any revs, we must also have dirs return self.has_revisions() def get_directories(self): """Get the directories that need to be loaded""" missing_dirs = set(self.storage.directory_missing( sorted(self.unique_trees.keys()) )) for dir_hash in missing_dirs: yield self.unique_trees[dir_hash] def has_revisions(self): """Checks whether we need to load revisions""" self.num_revisions = int(self.repo.tip()[0]) + 1 return self.num_revisions > 0 def update_tree_from_rev(self, tree, rev, only_these_files=None): """Iterates over changes in a revision and adds corresponding SimpleBlobs to a SimpleTree""" if rev >= 0: manifest = {k[4]: k for k in self.repo.manifest(rev=rev)} loop_keys = only_these_files or manifest.keys() for f in loop_keys: node_hash = manifest[f][0] file_mode = manifest[f][1] file_hash, _ = self.get_node_file_if_new(f, rev, node_hash) tree.add_blob(f, file_hash, file_mode) return tree def reconstruct_tree(self, directory): """Converts a flat directory into nested SimpleTrees.""" # This method exists because the code was already written to use # SimpleTree before then reducing memory use and converting to the # canonical format. A refactor using lookups instead of nesting could # obviate the need. new_tree = SimpleTree() for entry in directory['entries']: tgt = entry['target'] perms = entry['perms'] name = entry['name'] if tgt in self.unique_trees: # subtree new_tree[name] = self.reconstruct_tree(self.unique_trees[tgt]) else: # blob new_tree[name] = SimpleBlob(tgt, perms) new_tree.hash = directory['id'] return new_tree def collapse_tree(self, tree): """Converts nested SimpleTrees into multiple flat directories.""" # This method exists because the code was already written to use # SimpleTree before then reducing memory use and converting to the # canonical format. A refactor using lookups instead of nesting could # obviate the need. directory = { 'entries': [ { 'name': k, 'perms': v.mode, 'type': v.kind, 'target': (isinstance(v, SimpleBlob) and v.hash or self.collapse_tree(v)) } for k, v in tree.items() ] } tree.hash = identifiers.directory_identifier(directory) directory['id'] = tree.hash self.unique_trees[tree.hash] = directory return tree.hash def get_revision_ids(self): """Get the revisions that need to be loaded""" self.unique_trees = {} commit_tree = None for li in self.iter_changelog(): c = self.repo[li[1]] rev = c.rev() # start from the parent state p1 = c.p1().rev() if p1 in self.commit_trees: if p1 != rev-1: # Most of the time, a revision will inherit from the # previous one. In those cases we can reuse commit_tree, # otherwise build a new one here. 
parent_tree = self.unique_trees[self.commit_trees[p1]] commit_tree = self.reconstruct_tree(parent_tree) else: commit_tree = self.update_tree_from_rev(SimpleTree(), p1) # remove whatever is removed for f in c.removed(): commit_tree.remove_tree_node_for_path(f) # update whatever is updated self.update_tree_from_rev(commit_tree, rev, c.added()+c.modified()) self.commit_trees[rev] = self.collapse_tree(commit_tree) date_dict = identifiers.normalize_timestamp( int(c.date().timestamp()) ) author_dict = parse_author(c.author()) revision = { 'author': author_dict, 'date': date_dict, 'committer': author_dict, 'committer_date': date_dict, 'type': 'hg', 'directory': commit_tree.hash, 'message': c.description(), 'metadata': { 'extra_headers': [ ['phase', c.phase()], ['rev', rev], ['hidden', c.hidden()] ] }, 'synthetic': False, 'parents': [ self.revisions[p.node()]['id'] for p in c.parents() if p.rev() >= 0 ] } revision['id'] = identifiers.revision_identifier(revision) self.revisions[c.node()] = revision for n, r in self.revisions.items(): yield {'node': n, 'id': r['id']} def get_revisions(self): """Get the revision identifiers from the repository""" revs = {r['id']: r['node'] for r in self.get_revision_ids()} missing_revs = set(self.storage.revision_missing(revs.keys())) for r in missing_revs: yield self.revisions[revs[r]] def has_releases(self): """Checks whether we need to load releases""" self.num_releases = len([t for t in self.repo.tags() if not t[3]]) return self.num_releases > 0 def get_releases(self): """Get the releases that need to be loaded""" releases = {} for t in self.repo.tags(): islocal = t[3] name = t[0] if (name != b'tip' and not islocal): short_hash = t[2] target = self.revisions[self.repo[short_hash].node()]['id'] release = { 'name': name, 'target': target, 'target_type': 'revision', 'message': None, 'metadata': None, 'synthetic': False, 'author': None, 'date': None } id_hash = identifiers.release_identifier(release) release['id'] = id_hash releases[id_hash] = release missing_rels = set(self.storage.release_missing( sorted(releases.keys()) )) yield from (releases[r] for r in missing_rels) def has_occurrences(self): """Checks whether we need to load occurrences""" self.num_occurrences = len( self.repo.tags() + self.repo.branches() + self.repo.bookmarks()[0] ) return self.num_occurrences > 0 def get_occurrences(self): """Get the occurrences that need to be loaded""" for t in ( self.repo.tags() + self.repo.branches() + self.repo.bookmarks()[0] ): name = t[0] short_hash = t[2] target = self.revisions[self.repo[short_hash].node()]['id'] yield { 'branch': name, 'origin': self.origin_id, 'target': target, 'target_type': 'revision', 'visit': self.visit, } def get_fetch_history_result(self): """Return the data to store in fetch_history for the current loader""" return { 'contents': self.num_contents, 'directories': len(self.unique_trees), 'revisions': self.num_revisions, 'releases': self.num_releases, 'occurrences': self.num_occurrences, } def save_data(self): """We already have the data locally, no need to save it""" pass def eventful(self): """Whether the load was eventful""" return True -class HgLoaderFromArchive(HgLoader): - """Load an HG repository from a compressed archive. 
-
-    """
-    def __init__(self):
-        super().__init__(
-            logging_class='swh.loader.mercurial.HgLoaderFromArchive')
-
-    def prepare(self, origin_url, archive_path, visit_date):
-        tmpdir = tmp_extract(archive=archive_path,
-                             prefix='swh.loader.hg.',
-                             log=self.log, source=origin_url)
-        super().prepare(origin_url, tmpdir.name, visit_date)
-
-
 if __name__ == '__main__':
     import logging
     import sys

     logging.basicConfig(
         level=logging.DEBUG,
         format='%(asctime)s %(process)d %(message)s'
     )
     loader = HgLoader()

     origin_url = sys.argv[1]
     directory = sys.argv[2]
     visit_date = datetime.datetime.now(tz=datetime.timezone.utc)

     print(loader.load(origin_url, directory, visit_date))
diff --git a/swh/loader/mercurial/tasks.py b/swh/loader/mercurial/tasks.py
index 1143992..11ea571 100644
--- a/swh/loader/mercurial/tasks.py
+++ b/swh/loader/mercurial/tasks.py
@@ -1,66 +1,43 @@
 # Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from swh.scheduler.task import Task

-from .bundle20_loader import HgBundle20Loader
-from .slow_loader import HgLoader, HgLoaderFromArchive
+from .bundle20_loader import HgBundle20Loader, HgArchiveBundle20Loader


 class LoadMercurialTsk(Task):
     """Mercurial repository loading

     """
     task_queue = 'swh_loader_mercurial'

-    def run_task(self, *, origin_url, directory, visit_date):
+    def run_task(self, *, origin_url, visit_date, directory):
         """Import a mercurial tarball into swh.

         Args: see :func:`DepositLoader.load`.

         """
         loader = HgBundle20Loader()
         loader.log = self.log
         loader.load(origin_url=origin_url,
                     directory=directory,
                     visit_date=visit_date)


-class SlowLoadMercurialTsk(Task):
-    """Mercurial repository loading
-
-    """
-    task_queue = 'swh_loader_mercurial_slow'
-
-    def run_task(self, *, origin_url, directory, visit_date):
-        """Import a mercurial tarball into swh.
-
-        Args: see :func:`DepositLoader.load`.
-
-        """
-        loader = HgLoader()
-        loader.log = self.log
-        loader.load(origin_url=origin_url,
-                    directory=directory,
-                    visit_date=visit_date)
-
-
-class SlowLoadMercurialArchiveTsk(Task):
-    """Mercurial repository loading
-
-    """
-    task_queue = 'swh_loader_mercurial_slow_archive'
+class LoadArchiveMercurialTsk(Task):
+    task_queue = 'swh_loader_mercurial_archive'

     def run_task(self, *, origin_url, archive_path, visit_date):
         """Import a mercurial tarball into swh.

         Args: see :func:`DepositLoader.load`.

         """
-        loader = HgLoaderFromArchive()
+        loader = HgArchiveBundle20Loader()
         loader.log = self.log
         loader.load(origin_url=origin_url,
                     archive_path=archive_path,
                     visit_date=visit_date)
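
For reference, a minimal usage sketch of the two loaders this diff keeps, mirroring how the tasks in swh/loader/mercurial/tasks.py invoke them. It assumes, as those tasks do, that the base loader's load() forwards its keyword arguments to prepare(); the origin URL and archive path below are placeholders.

    import datetime

    from swh.loader.mercurial.bundle20_loader import (
        HgBundle20Loader,
        HgArchiveBundle20Loader,
    )

    visit_date = datetime.datetime.now(tz=datetime.timezone.utc)

    # Remote repository (or a local one, when directory points at a checkout),
    # ingested through an HG20 bundle.
    bundle_loader = HgBundle20Loader()
    bundle_loader.load(origin_url='https://hg.example.org/repo',  # placeholder
                       visit_date=visit_date,
                       directory=None)

    # Repository shipped inside an archive: tmp_extract() unpacks it to a
    # temporary directory, then the bundle loader above takes over.
    archive_loader = HgArchiveBundle20Loader()
    archive_loader.load(origin_url='https://hg.example.org/repo',  # placeholder
                        archive_path='/tmp/repo.tgz',              # placeholder
                        visit_date=visit_date)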