diff --git a/PKG-INFO b/PKG-INFO index 6b4300d..1c34d1b 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,130 +1,130 @@ Metadata-Version: 2.1 Name: swh.loader.mercurial -Version: 0.0.13 +Version: 0.0.14 Summary: Software Heritage Mercurial Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDHG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-mercurial Description: swh-loader-mercurial ========================= # Configuration file In usual location for a loader, *{/etc/softwareheritage/ | ~/.swh/ | ~/.config/swh/}loader/hg.yml*: ``` YAML storage: cls: remote args: url: http://localhost:5002/ ``` # Basic use The main entry point to import a Mercurial repository is the `main` function defined in the `swh.loader.mercurial.cli` module: ``` bash python3 -m swh.loader.mercurial.cli ``` If the Python package has been installed via `pip`, you should be able to type: ``` bash user@host:~$ swh-loader-hg --help Usage: swh-loader-hg [OPTIONS] ORIGIN_URL Options: -d, --hg-directory TEXT Path to the hg (local) directory to load from. If unset, the hg repo will ben cloned from the given (origin) url -a, --hg-archive TEXT Path to the hg (local) archive file to load from. -D, --visit-date TEXT Visit date (defaults to now) -l, --log-level [NOTSET|DEBUG|INFO|WARNING|ERROR|CRITICAL] Log level --help Show this message and exit. ``` For example: ``` bash user@host:~$ swh-loader-hg https://www.mercurial-scm.org/repo/hello [...] ``` # From Python From python3's toplevel: ## Remote ``` Python project = 'hello' # remote repository origin_url = 'https://www.mercurial-scm.org/repo/%s' % project # local clone directory = '/home/storage/hg/repo/%s' % project import logging logging.basicConfig(level=logging.DEBUG) from swh.loader.mercurial.tasks import LoadMercurial t = LoadMercurial() t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00') ``` ## local directory Only origin, contents, and directories are filled so far. Remaining objects are empty (revision, release, occurrence). 
``` Python project = '756015-ipv6' directory = '/home/storage/hg/repo/%s' % project origin_url = 'https://%s.googlecode.com' % project import logging logging.basicConfig(level=logging.DEBUG) from swh.loader.mercurial.tasks import LoadMercurial t = LoadMercurial() t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00') ``` ## local archive ``` Python project = '756015-ipv6-source-archive.zip' archive_path = '/home/storage/hg/repo/%s' % project origin_url = 'https://%s-archive.googlecode.com' % project import logging logging.basicConfig(level=logging.DEBUG) from swh.loader.mercurial.tasks import LoadArchiveMercurial t = LoadArchiveMercurial() t.run(origin_url=origin_url, archive_path=archive_path, visit_date='2016-05-03T15:16:32+00:00') ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 4 - Beta Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.loader.mercurial.egg-info/PKG-INFO b/swh.loader.mercurial.egg-info/PKG-INFO index 6b4300d..1c34d1b 100644 --- a/swh.loader.mercurial.egg-info/PKG-INFO +++ b/swh.loader.mercurial.egg-info/PKG-INFO @@ -1,130 +1,130 @@ Metadata-Version: 2.1 Name: swh.loader.mercurial -Version: 0.0.13 +Version: 0.0.14 Summary: Software Heritage Mercurial Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDHG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-mercurial Description: swh-loader-mercurial ========================= # Configuration file In usual location for a loader, *{/etc/softwareheritage/ | ~/.swh/ | ~/.config/swh/}loader/hg.yml*: ``` YAML storage: cls: remote args: url: http://localhost:5002/ ``` # Basic use The main entry point to import a Mercurial repository is the `main` function defined in the `swh.loader.mercurial.cli` module: ``` bash python3 -m swh.loader.mercurial.cli ``` If the Python package has been installed via `pip`, you should be able to type: ``` bash user@host:~$ swh-loader-hg --help Usage: swh-loader-hg [OPTIONS] ORIGIN_URL Options: -d, --hg-directory TEXT Path to the hg (local) directory to load from. If unset, the hg repo will ben cloned from the given (origin) url -a, --hg-archive TEXT Path to the hg (local) archive file to load from. -D, --visit-date TEXT Visit date (defaults to now) -l, --log-level [NOTSET|DEBUG|INFO|WARNING|ERROR|CRITICAL] Log level --help Show this message and exit. ``` For example: ``` bash user@host:~$ swh-loader-hg https://www.mercurial-scm.org/repo/hello [...] ``` # From Python From python3's toplevel: ## Remote ``` Python project = 'hello' # remote repository origin_url = 'https://www.mercurial-scm.org/repo/%s' % project # local clone directory = '/home/storage/hg/repo/%s' % project import logging logging.basicConfig(level=logging.DEBUG) from swh.loader.mercurial.tasks import LoadMercurial t = LoadMercurial() t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00') ``` ## local directory Only origin, contents, and directories are filled so far. Remaining objects are empty (revision, release, occurrence). 
``` Python project = '756015-ipv6' directory = '/home/storage/hg/repo/%s' % project origin_url = 'https://%s.googlecode.com' % project import logging logging.basicConfig(level=logging.DEBUG) from swh.loader.mercurial.tasks import LoadMercurial t = LoadMercurial() t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00') ``` ## local archive ``` Python project = '756015-ipv6-source-archive.zip' archive_path = '/home/storage/hg/repo/%s' % project origin_url = 'https://%s-archive.googlecode.com' % project import logging logging.basicConfig(level=logging.DEBUG) from swh.loader.mercurial.tasks import LoadArchiveMercurial t = LoadArchiveMercurial() t.run(origin_url=origin_url, archive_path=archive_path, visit_date='2016-05-03T15:16:32+00:00') ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 4 - Beta Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.loader.mercurial.egg-info/SOURCES.txt b/swh.loader.mercurial.egg-info/SOURCES.txt index 363e866..bcd26bd 100644 --- a/swh.loader.mercurial.egg-info/SOURCES.txt +++ b/swh.loader.mercurial.egg-info/SOURCES.txt @@ -1,47 +1,46 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile README.md requirements-swh.txt requirements-test.txt requirements.txt setup.py version.txt debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.loader.mercurial.egg-info/PKG-INFO swh.loader.mercurial.egg-info/SOURCES.txt swh.loader.mercurial.egg-info/dependency_links.txt swh.loader.mercurial.egg-info/entry_points.txt swh.loader.mercurial.egg-info/requires.txt swh.loader.mercurial.egg-info/top_level.txt swh/loader/__init__.py swh/loader/mercurial/__init__.py swh/loader/mercurial/archive_extract.py -swh/loader/mercurial/bundle20_loader_verifier.py swh/loader/mercurial/bundle20_reader.py swh/loader/mercurial/chunked_reader.py swh/loader/mercurial/cli.py swh/loader/mercurial/converters.py swh/loader/mercurial/loader.py +swh/loader/mercurial/loader_verifier.py swh/loader/mercurial/objects.py -swh/loader/mercurial/slow_loader.py swh/loader/mercurial/tasks.py swh/loader/mercurial/tests/__init__.py swh/loader/mercurial/tests/test_loader.org swh/loader/mercurial/tests/test_loader.py swh/loader/mercurial/tests/resources/hello.tgz swh/loader/mercurial/tests/resources/the-sandbox.tgz \ No newline at end of file diff --git a/swh/loader/mercurial/loader.py b/swh/loader/mercurial/loader.py index 7a85340..a9a1c8f 100644 --- a/swh/loader/mercurial/loader.py +++ b/swh/loader/mercurial/loader.py @@ -1,526 +1,531 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """This document contains a SWH loader for ingesting repository data from Mercurial version 2 bundle files. """ # NOTE: The code here does expensive work twice in places because of the # intermediate need to check for what is missing before sending to the database # and the desire to not juggle very large amounts of data. 
# TODO: Decide whether to also serialize to disk and read back more quickly # from there. Maybe only for very large repos and fast drives. # - Avi import datetime import hglib import os import random import re from dateutil import parser from shutil import rmtree from tempfile import mkdtemp from swh.model import identifiers from swh.model.hashutil import ( MultiHash, hash_to_hex, hash_to_bytes, DEFAULT_ALGORITHMS ) from swh.loader.core.loader import SWHStatelessLoader from swh.loader.core.converters import content_for_storage from swh.loader.core.utils import clean_dangling_folders from . import converters from .archive_extract import tmp_extract from .bundle20_reader import Bundle20Reader from .converters import PRIMARY_ALGO as ALGO from .objects import SelectiveCache, SimpleTree TAG_PATTERN = re.compile('[0-9A-Fa-f]{40}') TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.mercurial.' +HEAD_POINTER_NAME = b'tip' + class HgBundle20Loader(SWHStatelessLoader): """Mercurial loader able to deal with remote or local repository. """ CONFIG_BASE_FILENAME = 'loader/hg' ADDITIONAL_CONFIG = { 'bundle_filename': ('str', 'HG20_none_bundle'), 'reduce_effort': ('bool', True), # default: Try to be smart about time 'temp_directory': ('str', '/tmp'), 'cache1_size': ('int', 800*1024*1024), 'cache2_size': ('int', 800*1024*1024), } def __init__(self, logging_class='swh.loader.mercurial.Bundle20Loader'): super().__init__(logging_class=logging_class) self.content_max_size_limit = self.config['content_size_limit'] self.bundle_filename = self.config['bundle_filename'] self.reduce_effort_flag = self.config['reduce_effort'] self.empty_repository = None self.temp_directory = self.config['temp_directory'] self.cache1_size = self.config['cache1_size'] self.cache2_size = self.config['cache2_size'] self.working_directory = None self.bundle_path = None def pre_cleanup(self): """Cleanup potential dangling files from prior runs (e.g. OOM killed tasks) """ clean_dangling_folders(self.temp_directory, pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, log=self.log) def cleanup(self): """Clean temporary working directory """ if self.bundle_path and os.path.exists(self.bundle_path): self.log.debug('Cleanup up working bundle %s' % self.bundle_path) os.unlink(self.bundle_path) if self.working_directory and os.path.exists(self.working_directory): self.log.debug('Cleanup up working directory %s' % ( self.working_directory, )) rmtree(self.working_directory) def get_heads(self, repo): """Read the closed branches heads (branch, bookmarks) and returns a - dict with branch_name (bytes) and mercurial's node id - (bytes). Those needs conversion to swh-ids. This is taken + dict with key the branch_name (bytes) and values the tuple + (pointer nature (bytes), mercurial's node id + (bytes)). Those needs conversion to swh-ids. This is taken care of in get_revisions. 
""" b = {} - for _, node_hash_id, _, branch_name, *_ in repo.heads(): - b[branch_name] = hash_to_bytes( - node_hash_id.decode()) + for _, node_hash_id, pointer_nature, branch_name, *_ in repo.heads(): + b[branch_name] = ( + pointer_nature, hash_to_bytes(node_hash_id.decode())) bookmarks = repo.bookmarks() if bookmarks and bookmarks[0]: for bookmark_name, _, target_short in bookmarks[0]: target = repo[target_short].node() - b[bookmark_name] = hash_to_bytes(target.decode()) + b[bookmark_name] = (None, hash_to_bytes(target.decode())) return b def prepare_origin_visit(self, *, origin_url, visit_date, **kwargs): self.origin_url = origin_url self.origin = {'url': self.origin_url, 'type': 'hg'} if isinstance(visit_date, str): # visit_date can be string or datetime visit_date = parser.parse(visit_date) self.visit_date = visit_date def prepare(self, *, origin_url, visit_date, directory=None): """Prepare the necessary steps to load an actual remote or local repository. To load a local repository, pass the optional directory parameter as filled with a path to a real local folder. To load a remote repository, pass the optional directory parameter as None. Args: origin_url (str): Origin url to load visit_date (str/datetime): Date of the visit directory (str/None): The local directory to load """ self.branches = {} self.tags = [] self.releases = {} self.node_2_rev = {} if not directory: # remote repository self.working_directory = mkdtemp( prefix=TEMPORARY_DIR_PREFIX_PATTERN, suffix='-%s' % os.getpid(), dir=self.temp_directory) os.makedirs(self.working_directory, exist_ok=True) self.hgdir = self.working_directory self.log.debug('Cloning %s to %s' % ( self.origin['url'], self.hgdir)) hglib.clone(source=self.origin['url'], dest=self.hgdir) else: # local repository self.working_directory = None self.hgdir = directory self.bundle_path = os.path.join(self.hgdir, self.bundle_filename) self.log.debug('Bundling at %s' % self.bundle_path) with hglib.open(self.hgdir) as repo: self.heads = self.get_heads(repo) repo.bundle(bytes(self.bundle_path, 'utf-8'), all=True, type=b'none-v2') self.cache_filename1 = os.path.join( self.hgdir, 'swh-cache-1-%s' % ( hex(random.randint(0, 0xffffff))[2:], )) self.cache_filename2 = os.path.join( self.hgdir, 'swh-cache-2-%s' % ( hex(random.randint(0, 0xffffff))[2:], )) try: self.br = Bundle20Reader(bundlefile=self.bundle_path, cache_filename=self.cache_filename1, cache_size=self.cache1_size) except FileNotFoundError as e: # Empty repository! Still a successful visit targeting an # empty snapshot self.log.warn('%s is an empty repository!' % self.hgdir) self.empty_repository = True else: self.reduce_effort = set() if self.reduce_effort_flag: now = datetime.datetime.now(tz=datetime.timezone.utc) if (now - self.visit_date).days > 1: # Assuming that self.visit_date would be today for # a new visit, treat older visit dates as # indication of wanting to skip some processing # effort. 
for header, commit in self.br.yield_all_changesets(): ts = commit['time'].timestamp() if ts < self.visit_date.timestamp(): self.reduce_effort.add(header['node']) def has_contents(self): return not self.empty_repository def has_directories(self): return not self.empty_repository def has_revisions(self): return not self.empty_repository def has_releases(self): return not self.empty_repository def fetch_data(self): """Fetch the data from the data source.""" pass def get_contents(self): """Get the contents that need to be loaded.""" # NOTE: This method generates blobs twice to reduce memory usage # without generating disk writes. self.file_node_to_hash = {} hash_to_info = {} self.num_contents = 0 contents = {} missing_contents = set() for blob, node_info in self.br.yield_all_blobs(): self.num_contents += 1 file_name = node_info[0] header = node_info[2] length = len(blob) if header['linknode'] in self.reduce_effort: algorithms = [ALGO] else: algorithms = DEFAULT_ALGORITHMS h = MultiHash.from_data(blob, hash_names=algorithms) content = h.digest() content['length'] = length blob_hash = content[ALGO] self.file_node_to_hash[header['node']] = blob_hash if header['linknode'] in self.reduce_effort: continue hash_to_info[blob_hash] = node_info contents[blob_hash] = content missing_contents.add(blob_hash) if file_name == b'.hgtags': # https://www.mercurial-scm.org/wiki/GitConcepts#Tag_model # overwrite until the last one self.tags = (t for t in blob.split(b'\n') if t != b'') if contents: missing_contents = set( self.storage.content_missing( list(contents.values()), key_hash=ALGO ) ) # Clusters needed blobs by file offset and then only fetches the # groups at the needed offsets. focs = {} # "file/offset/contents" for blob_hash in missing_contents: _, file_offset, header = hash_to_info[blob_hash] focs.setdefault(file_offset, {}) focs[file_offset][header['node']] = blob_hash hash_to_info = None for offset, node_hashes in sorted(focs.items()): for header, data, *_ in self.br.yield_group_objects( group_offset=offset ): node = header['node'] if node in node_hashes: blob, meta = self.br.extract_meta_from_blob(data) content = contents.pop(node_hashes[node], None) if content: content['data'] = blob yield content_for_storage( content, log=self.log, max_content_size=self.content_max_size_limit, origin_id=self.origin_id ) def load_directories(self): """This is where the work is done to convert manifest deltas from the repository bundle into SWH directories. 
""" self.mnode_to_tree_id = {} cache_hints = self.br.build_manifest_hints() def tree_size(t): return t.size() self.trees = SelectiveCache(cache_hints=cache_hints, size_function=tree_size, filename=self.cache_filename2, max_size=self.cache2_size) tree = SimpleTree() for header, added, removed in self.br.yield_all_manifest_deltas( cache_hints ): node = header['node'] basenode = header['basenode'] tree = self.trees.fetch(basenode) or tree # working tree for path in removed.keys(): tree = tree.remove_tree_node_for_path(path) for path, info in added.items(): file_node, is_symlink, perms_code = info tree = tree.add_blob( path, self.file_node_to_hash[file_node], is_symlink, perms_code ) if header['linknode'] in self.reduce_effort: self.trees.store(node, tree) else: new_dirs = [] self.mnode_to_tree_id[node] = tree.hash_changed(new_dirs) self.trees.store(node, tree) yield header, tree, new_dirs def get_directories(self): """Compute directories to load """ dirs = {} self.num_directories = 0 for _, _, new_dirs in self.load_directories(): for d in new_dirs: self.num_directories += 1 dirs[d['id']] = d missing_dirs = list(dirs.keys()) if missing_dirs: missing_dirs = self.storage.directory_missing(missing_dirs) for _id in missing_dirs: yield dirs[_id] dirs = {} def get_revisions(self): """Compute revisions to load """ revisions = {} self.num_revisions = 0 for header, commit in self.br.yield_all_changesets(): if header['node'] in self.reduce_effort: continue self.num_revisions += 1 date_dict = identifiers.normalize_timestamp( int(commit['time'].timestamp()) ) author_dict = converters.parse_author(commit['user']) if commit['manifest'] == Bundle20Reader.NAUGHT_NODE: directory_id = SimpleTree().hash_changed() else: directory_id = self.mnode_to_tree_id[commit['manifest']] extra_meta = [] extra = commit.get('extra') if extra: for e in extra.split(b'\x00'): k, v = e.split(b':', 1) k = k.decode('utf-8') extra_meta.append([k, v]) revision = { 'author': author_dict, 'date': date_dict, 'committer': author_dict, 'committer_date': date_dict, 'type': 'hg', 'directory': directory_id, 'message': commit['message'], 'metadata': { 'node': hash_to_hex(header['node']), 'extra_headers': [ ['time_offset_seconds', str(commit['time_offset_seconds']).encode('utf-8')], ] + extra_meta }, 'synthetic': False, 'parents': [] } p1 = self.node_2_rev.get(header['p1']) p2 = self.node_2_rev.get(header['p2']) if p1: revision['parents'].append(p1) if p2: revision['parents'].append(p2) revision['id'] = hash_to_bytes( identifiers.revision_identifier(revision) ) self.node_2_rev[header['node']] = revision['id'] revisions[revision['id']] = revision # Converts heads to use swh ids self.heads = { - branch_name: self.node_2_rev[node_id] - for branch_name, node_id in self.heads.items() + branch_name: (pointer_nature, self.node_2_rev[node_id]) + for branch_name, (pointer_nature, node_id) in self.heads.items() } missing_revs = revisions.keys() if missing_revs: missing_revs = set( self.storage.revision_missing(list(missing_revs)) ) for r in missing_revs: yield revisions[r] self.mnode_to_tree_id = None def _read_tag(self, tag, split_byte=b' '): node, *name = tag.split(split_byte) name = split_byte.join(name) return node, name def get_releases(self): """Get the releases that need to be loaded.""" self.num_releases = 0 releases = {} missing_releases = [] for t in self.tags: self.num_releases += 1 node, name = self._read_tag(t) node = node.decode() node_bytes = hash_to_bytes(node) if not TAG_PATTERN.match(node): self.log.warn('Wrong pattern (%s) found in 
tags. Skipping' % ( node, )) continue if node_bytes not in self.node_2_rev: self.log.warn('No matching revision for tag %s ' '(hg changeset: %s). Skipping' % (name.decode(), node)) continue tgt_rev = self.node_2_rev[node_bytes] release = { 'name': name, 'target': tgt_rev, 'target_type': 'revision', 'message': None, 'metadata': None, 'synthetic': False, 'author': {'name': None, 'email': None, 'fullname': b''}, 'date': None } id_hash = hash_to_bytes( identifiers.release_identifier(release)) release['id'] = id_hash missing_releases.append(id_hash) releases[id_hash] = release self.releases[name] = id_hash if missing_releases: missing_releases = set( self.storage.release_missing(missing_releases)) for _id in missing_releases: yield releases[_id] def get_snapshot(self): """Get the snapshot that need to be loaded.""" branches = {} - for name, target in self.heads.items(): + for name, (pointer_nature, target) in self.heads.items(): branches[name] = {'target': target, 'target_type': 'revision'} + if pointer_nature == HEAD_POINTER_NAME: + branches[b'HEAD'] = {'target': name, 'target_type': 'alias'} for name, target in self.releases.items(): branches[name] = {'target': target, 'target_type': 'release'} snap = { 'id': None, 'branches': branches, } snap['id'] = identifiers.identifier_to_bytes( identifiers.snapshot_identifier(snap)) return snap def get_fetch_history_result(self): """Return the data to store in fetch_history.""" return { 'contents': self.num_contents, 'directories': self.num_directories, 'revisions': self.num_revisions, 'releases': self.num_releases, } class HgArchiveBundle20Loader(HgBundle20Loader): """Mercurial loader for repository wrapped within archives. """ def __init__(self): super().__init__( logging_class='swh.loader.mercurial.HgArchiveBundle20Loader') self.temp_dir = None def prepare(self, *, origin_url, archive_path, visit_date): self.temp_dir = tmp_extract(archive=archive_path, dir=self.temp_directory, prefix=TEMPORARY_DIR_PREFIX_PATTERN, suffix='.dump-%s' % os.getpid(), log=self.log, source=origin_url) repo_name = os.listdir(self.temp_dir)[0] directory = os.path.join(self.temp_dir, repo_name) super().prepare(origin_url=origin_url, visit_date=visit_date, directory=directory) def cleanup(self): if self.temp_dir and os.path.exists(self.temp_dir): rmtree(self.temp_dir) super().cleanup() diff --git a/swh/loader/mercurial/bundle20_loader_verifier.py b/swh/loader/mercurial/loader_verifier.py similarity index 60% rename from swh/loader/mercurial/bundle20_loader_verifier.py rename to swh/loader/mercurial/loader_verifier.py index 6321b15..e29f52f 100644 --- a/swh/loader/mercurial/bundle20_loader_verifier.py +++ b/swh/loader/mercurial/loader_verifier.py @@ -1,255 +1,248 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import click import code import datetime import hglib +import logging import os import random import sys import time from binascii import hexlify, unhexlify from swh.model.hashutil import MultiHash from .loader import HgBundle20Loader from .converters import PRIMARY_ALGO as ALGO from .objects import SimpleTree class HgLoaderValidater(HgBundle20Loader): def generate_all_blobs(self, validate=True, frequency=1): - print('GENERATING BLOBS') + logging.debug('GENERATING BLOBS') i = 0 start = time.time() u = set() for blob, node_info in self.br.yield_all_blobs(): filename = 
node_info[0] header = node_info[2] i += 1 hashes = MultiHash.from_data(blob, hash_names=set([ALGO])).digest() bhash = hashes[ALGO] self.file_node_to_hash[header['node']] = bhash u.update([bhash]) if validate: if random.random() < frequency: self.validate_blob(filename, header, blob) if i % 10000 == 0: - print(i) + logging.debug(i) - print('') - print('FOUND', i, 'BLOBS') - print('FOUND', len(u), 'UNIQUE BLOBS') - print('ELAPSED', time.time()-start) + logging.debug('\nFOUND %s BLOBS' % i) + logging.debug('FOUND: %s UNIQUE BLOBS' % len(u)) + logging.debug('ELAPSED: %s' % (time.time()-start)) def validate_blob(self, filename, header, blob): - if not self.hg: - self.hg = hglib.open(self.hgdir) - - data = bytes(blob) - - filepath = os.path.join(self.hg.root(), bytes(filename)) - linknode = hexlify(header['linknode']) - cat_contents = self.hg.cat([filepath], rev=linknode) - - if cat_contents != data: - print('INTERNAL ERROR ERROR ERROR ERROR') - print(filename) - print(header) - print('-----') - print(cat_contents) - print('---- vs ----') - print(data) - code.interact(local=dict(globals(), **locals())) - quit() - else: - print('v', end='') + if not self.hg: + self.hg = hglib.open(self.hgdir) + + data = bytes(blob) + + filepath = os.path.join(self.hg.root(), bytes(filename)) + linknode = hexlify(header['linknode']) + cat_contents = self.hg.cat([filepath], rev=linknode) + + if cat_contents != data: + logging.debug('INTERNAL ERROR ERROR ERROR ERROR') + logging.debug(filename) + logging.debug(header) + logging.debug('-----') + logging.debug(cat_contents) + logging.debug('---- vs ----') + logging.debug(data) + code.interact(local=dict(globals(), **locals())) + quit() + else: + logging.debug('v', end='') def generate_all_trees(self, validate=True, frequency=1): - print('GENERATING MANIFEST TREES') + logging.debug('GENERATING MANIFEST TREES') c = 0 n = 0 u = set() start = time.time() validated = 0 for header, tree, new_dirs in self.load_directories(): if validate and (c >= validated) and (random.random() < frequency): self.validate_tree(tree, header, c) for d in new_dirs: u.add(d['id']) c += 1 n += len(new_dirs) - print('.', end='') + logging.debug('.', end='') if c % 20 == 0: sys.stdout.flush() if c % 10000 == 0: - print(c) + logging.debug(c) - print('') - print('FOUND', c, 'COMMIT MANIFESTS') - print('FOUND', n, 'NEW DIRS') - print('FOUND', len(u), 'UNIQUE DIRS') - print('ELAPSED', time.time()-start) + logging.debug('\nFOUND: %s COMMIT MANIFESTS' % c) + logging.debug('FOUND: %s NEW DIRS' % n) + logging.debug('FOUND: %s UNIQUE DIRS' % len(u)) + logging.debug('ELAPSED: %s' % (time.time()-start)) def validate_tree(self, tree, header, i): if not self.hg: self.hg = hglib.open(self.hgdir) commit_id = header['linknode'] if len(commit_id) == 20: commit_id = hexlify(commit_id) base_tree = SimpleTree() base_files = list(self.hg.manifest(rev=commit_id)) bfiles = sorted([f[4] for f in base_files]) for p in base_files: base_tree.add_blob( p[4], self.file_node_to_hash[unhexlify(p[0])], p[3], p[1] ) base_tree.hash_changed() files = sorted(list(tree.flatten().keys())) if tree != base_tree: - print('validating rev:', i, 'commit:', commit_id) - print('validating files:', len(files), len(base_files)) - print(' INVALID TREE') + logging.debug('validating rev: %s commit: %s' % (i, commit_id)) + logging.debug('validating files: %s %s INVALID TREE' % ( + len(files), len(base_files))) def so1(a): keys = [k['name'] for k in a['entries']] return b''.join(sorted(keys)) tree_dirs = [d for d in tree.yield_swh_directories()] 
base_dirs = [d for d in base_tree.yield_swh_directories()] tree_dirs.sort(key=so1) base_dirs.sort(key=so1) # for i in range(len(tree_dirs)): # if tree_dirs[i] != base_dirs[i]: - # print(i) + # logging.debug(i) # code.interact(local=dict(globals(), **locals())) - print('Program will quit after your next Ctrl-D') + logging.debug('Program will quit after your next Ctrl-D') code.interact(local=dict(globals(), **locals())) quit() else: - print('v', end='') + logging.debug('v') def generate_all_commits(self, validate=True, frequency=1): i = 0 start = time.time() for rev in self.get_revisions(): - print('.', end='') + logging.debug('.', end='') i += 1 if i % 20 == 0: sys.stdout.flush() - print('') - print('FOUND', i, 'COMMITS') - print('ELAPSED', time.time()-start) + logging.debug('') + logging.debug('\nFOUND: %s COMMITS' % i) + logging.debug('ELAPSED: %s' % (time.time()-start)) def runtest(self, hgdir, validate_blobs=False, validate_trees=False, frequency=1.0, test_iterative=False): - """ - HgLoaderValidater().runtest('/home/avi/SWH/mozilla-unified') + """HgLoaderValidater().runtest('/home/avi/SWH/mozilla-unified') + """ self.origin_id = 'test' dt = datetime.datetime.now(tz=datetime.timezone.utc) if test_iterative: dt = dt - datetime.timedelta(10) hgrepo = None if (hgdir.lower().startswith('http:') or hgdir.lower().startswith('https:')): hgrepo, hgdir = hgdir, hgrepo self.hgdir = hgdir try: - print('preparing') - self.prepare(hgrepo, dt, hgdir) + logging.debug('preparing') + self.prepare(origin_url=hgrepo, visit_date=dt, directory=hgdir) self.file_node_to_hash = {} # self.generate_all_blobs(validate=validate_blobs, # frequency=frequency) # self.generate_all_trees(validate=validate_trees, frequency=frequency) # self.generate_all_commits() - print('getting contents') + logging.debug('getting contents') cs = 0 for c in self.get_contents(): cs += 1 pass - print('getting directories') + logging.debug('getting directories') ds = 0 for d in self.get_directories(): ds += 1 pass revs = 0 - print('getting revisions') + logging.debug('getting revisions') for rev in self.get_revisions(): revs += 1 pass - print('getting releases') + logging.debug('getting releases') rels = 0 for rel in self.get_releases(): rels += 1 - print(rel) + logging.debug(rel) self.visit = 'foo' - print('getting snapshot') + logging.debug('getting snapshot') o = self.get_snapshot() - print(o['branches'].keys()) + logging.debug('Snapshot: %s' % o) finally: self.cleanup() - print('final count: ', - 'cs', cs, 'ds', ds, 'revs', revs, 'rels', rels) - - -def main(): - if len(sys.argv) > 1: - test_repo = sys.argv[1] - else: - print('Please pass in the path to an HG repository.') - quit() - - while test_repo[-1] == '/': - test_repo = test_repo[:-1] - - if len(sys.argv) > 2: - validate_frequency = float(sys.argv[2]) - else: - validate_frequency = 0.001 - - if len(sys.argv) > 3: - test_iterative = True - else: - test_iterative = False - - HgLoaderValidater().runtest(test_repo, True, True, validate_frequency, - test_iterative) + logging.info('final count: cs %s ds %s revs %s rels %s' % ( + cs, ds, revs, rels)) + + +@click.command() +@click.option('--verbose', is_flag=True, default=False) +@click.option('--validate-frequency', default=0.001, type=click.FLOAT) +@click.option('--test-iterative', default=False, type=click.BOOL) +@click.argument('repository-url', required=1) +def main(verbose, validate_frequency, test_iterative, repository_url): + logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO) + while repository_url[-1] == '/': 
+ repository_url = repository_url[:-1] + + HgLoaderValidater().runtest( + repository_url, + validate_blobs=True, validate_trees=True, + frequency=validate_frequency, + test_iterative=test_iterative) if __name__ == '__main__': main() diff --git a/swh/loader/mercurial/slow_loader.py b/swh/loader/mercurial/slow_loader.py deleted file mode 100644 index 90740fe..0000000 --- a/swh/loader/mercurial/slow_loader.py +++ /dev/null @@ -1,471 +0,0 @@ -# Copyright (C) 2017-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -# WARNING WARNING WARNING WARNING -# hglib is too slow to be super useful. Unfortunately it's also the only -# python3 library for mercurial as of this writing. - Avi - -import datetime -import hglib -import os - -from swh.model import identifiers -from swh.model.hashutil import MultiHash, DEFAULT_ALGORITHMS, hash_to_hex -from swh.loader.core.loader import SWHStatelessLoader - -from .converters import parse_author, PRIMARY_ALGO as ALGO - - -OS_PATH_SEP = os.path.sep.encode('utf-8') - - -def data_to_content_id(data): - size = len(data) - ret = { - 'length': size, - } - ret.update(identifiers.content_identifier({'data': data})) - return ret - - -def blob_to_content_dict(data, existing_hashes=None, max_size=None, - logger=None): - """Convert blob data to a SWH Content. If the blob already - has hashes computed, don't recompute them. - TODO: This should be unified with similar functions in other places. - - args: - existing_hashes: dict of hash algorithm:value pairs - max_size: size over which blobs should be rejected - logger: logging class instance - returns: - A Software Heritage "content". - """ - existing_hashes = existing_hashes or {} - - size = len(data) - content = { - 'length': size, - } - content.update(existing_hashes) - - hash_types = list(existing_hashes.keys()) - hashes_to_do = DEFAULT_ALGORITHMS.difference(hash_types) - hashes = MultiHash.from_data(data, hash_names=hashes_to_do).digest() - content.update(hashes) - - if max_size and (size > max_size): - content.update({ - 'status': 'absent', - 'reason': 'Content too large', - }) - if logger: - id_hash = hash_to_hex(content[ALGO]) - logger.info( - 'Skipping content %s, too large (%s > %s)' - % (id_hash, size, max_size), - extra={ - 'swh_type': 'loader_content_skip', - 'swh_id': id_hash, - 'swh_size': size - } - ) - else: - content.update({'data': data, 'status': 'visible'}) - - return content - - -class SimpleBlob: - """ Stores basic metadata for a blob object. - """ - kind = 'file' - - def __init__(self, file_hash, file_mode): - self.hash = file_hash - if not isinstance(file_mode, int): - self.mode = 0o100000 + int(file_mode, 8) - else: - self.mode = file_mode - - -class SimpleTree(dict): - """ Stores metadata for a nested 'tree'-like object. 
- """ - kind = 'dir' - mode = 0o040000 - - def add_tree_node_for_path(self, path): - """Deeply nests SimpleTrees according to a directory path and returns - a cursor to the deepest one""" - node = self - for d in path.split(OS_PATH_SEP): - node = node.setdefault(d, SimpleTree()) - return node - - def remove_tree_node_for_path(self, path): - """Deletes a SimpleBlob from inside nested SimpleTrees according to - the given file path""" - first, sep, rest = path.partition(OS_PATH_SEP) - if rest: - self[first].remove_tree_node_for_path(rest) - if not self.get(first): - del self[first] - else: - del self[first] - - def add_blob(self, file_path, file_hash, file_mode): - """Deeply nests a SimpleBlob inside nested SimpleTrees according to - the given file path""" - fdir = os.path.dirname(file_path) - fbase = os.path.basename(file_path) - if fdir: - node = self.add_tree_node_for_path(fdir) - else: - node = self - node[fbase] = SimpleBlob(file_hash, file_mode) - - -class HgLoader(SWHStatelessLoader): - """Load a mercurial repository from a directory. - - """ - CONFIG_BASE_FILENAME = 'loader/hg' - - def __init__(self, logging_class='swh.loader.mercurial.HgLoader'): - super().__init__(logging_class=logging_class) - - def prepare_origin_visit(self, origin_url, directory, visit_date): - self.origin = { - 'type': 'hg', - 'url': origin_url - } - self.visit_date = visit_date - - def prepare(self, origin_url, directory, visit_date): - """see base.BaseLoader.prepare""" - self.repo = hglib.open(directory) - self.node_to_blob_hash = {} - self.blob_hash_to_file_rev = {} - self.commit_trees = {} - self.unique_trees = {} - self.revisions = {} - - def fetch_data(self): - """Fetch the data from the data source""" - pass - - def has_contents(self): - """Checks whether we need to load contents""" - # if we have any revisions, then obviously we have contents. - return self.has_revisions() - - def iter_changelog(self): - """Iterate over the repository log""" - yield from self.repo.log('0:tip', removed=True) - - def get_node_file_if_new(self, f, rev, node_hash): - """Load a blob from disk""" - # Fast if the node hash is already cached. Somehow this shortcuts a - # meaningful but not huge percentage of the loads for a repository. - if node_hash not in self.node_to_blob_hash: - file_path = os.path.join(self.repo.root(), f) - - data = self.repo.cat([file_path], rev) - blob_hash = identifiers.content_identifier( - {'data': data} - )[ALGO] - - self.node_to_blob_hash[node_hash] = blob_hash - - if blob_hash not in self.blob_hash_to_file_rev: - # new blob - self.blob_hash_to_file_rev[blob_hash] = (file_path, rev) - return blob_hash, data - - return self.node_to_blob_hash[node_hash], None - - def get_content_ids(self): - """Get all the contents, but trim away the actual data""" - self.node_to_blob_hash = {} - self.blob_hash_to_file_rev = {} - self.num_contents = 0 - - for li in self.iter_changelog(): - c = self.repo[li] - rev = c.rev() - manifest = c.manifest() - - for f in c.added() + c.modified(): - node_hash = manifest[f] - blob_hash, data = self.get_node_file_if_new(f, rev, node_hash) - if data is not None: # new blob - self.num_contents += 1 - yield data_to_content_id(data) - - def get_contents(self): - """Get the contents that need to be loaded""" - # This method unfortunately loads and hashes the blobs twice. 
- - max_content_size = self.config['content_size_limit'] - missing_contents = set( - self.storage.content_missing( - self.get_content_ids(), - ALGO - ) - ) - - for oid in missing_contents: - file_path, rev = self.blob_hash_to_file_rev[oid] - data = self.repo.cat([file_path], rev) - yield blob_to_content_dict( - data, max_size=max_content_size, logger=self.log - ) - - def has_directories(self): - """Checks whether we need to load directories""" - # if we have any revs, we must also have dirs - return self.has_revisions() - - def get_directories(self): - """Get the directories that need to be loaded""" - missing_dirs = set(self.storage.directory_missing( - sorted(self.unique_trees.keys()) - )) - - for dir_hash in missing_dirs: - yield self.unique_trees[dir_hash] - - def has_revisions(self): - """Checks whether we need to load revisions""" - self.num_revisions = int(self.repo.tip()[0]) + 1 - return self.num_revisions > 0 - - def update_tree_from_rev(self, tree, rev, only_these_files=None): - """Iterates over changes in a revision and adds corresponding - SimpleBlobs to a SimpleTree""" - if rev >= 0: - manifest = {k[4]: k for k in self.repo.manifest(rev=rev)} - loop_keys = only_these_files or manifest.keys() - for f in loop_keys: - node_hash = manifest[f][0] - file_mode = manifest[f][1] - file_hash, _ = self.get_node_file_if_new(f, rev, node_hash) - tree.add_blob(f, file_hash, file_mode) - - return tree - - def reconstruct_tree(self, directory): - """Converts a flat directory into nested SimpleTrees.""" - # This method exists because the code was already written to use - # SimpleTree before then reducing memory use and converting to the - # canonical format. A refactor using lookups instead of nesting could - # obviate the need. - new_tree = SimpleTree() - for entry in directory['entries']: - tgt = entry['target'] - perms = entry['perms'] - name = entry['name'] - if tgt in self.unique_trees: # subtree - new_tree[name] = self.reconstruct_tree(self.unique_trees[tgt]) - else: # blob - new_tree[name] = SimpleBlob(tgt, perms) - new_tree.hash = directory['id'] - return new_tree - - def collapse_tree(self, tree): - """Converts nested SimpleTrees into multiple flat directories.""" - # This method exists because the code was already written to use - # SimpleTree before then reducing memory use and converting to the - # canonical format. A refactor using lookups instead of nesting could - # obviate the need. - directory = { - 'entries': [ - { - 'name': k, - 'perms': v.mode, - 'type': v.kind, - 'target': (isinstance(v, SimpleBlob) - and v.hash - or self.collapse_tree(v)) - } - for k, v in tree.items() - ] - } - tree.hash = identifiers.directory_identifier(directory) - directory['id'] = tree.hash - self.unique_trees[tree.hash] = directory - return tree.hash - - def get_revision_ids(self): - """Get the revisions that need to be loaded""" - self.unique_trees = {} - commit_tree = None - for li in self.iter_changelog(): - c = self.repo[li[1]] - rev = c.rev() - - # start from the parent state - p1 = c.p1().rev() - if p1 in self.commit_trees: - if p1 != rev-1: - # Most of the time, a revision will inherit from the - # previous one. In those cases we can reuse commit_tree, - # otherwise build a new one here. 
- parent_tree = self.unique_trees[self.commit_trees[p1]] - commit_tree = self.reconstruct_tree(parent_tree) - else: - commit_tree = self.update_tree_from_rev(SimpleTree(), p1) - - # remove whatever is removed - for f in c.removed(): - commit_tree.remove_tree_node_for_path(f) - - # update whatever is updated - self.update_tree_from_rev(commit_tree, rev, c.added()+c.modified()) - - self.commit_trees[rev] = self.collapse_tree(commit_tree) - - date_dict = identifiers.normalize_timestamp( - int(c.date().timestamp()) - ) - author_dict = parse_author(c.author()) - - parents = [] - for p in c.parents(): - if p.rev() >= 0: - parents.append(self.revisions[p.node()]['id']) - - phase = c.phase() # bytes - rev = str(rev).encode('utf-8') - hidden = str(c.hidden()).encode('utf-8') - hg_headers = [['phase', phase], ['rev', rev], ['hidden', hidden]] - - revision = { - 'author': author_dict, - 'date': date_dict, - 'committer': author_dict, - 'committer_date': date_dict, - 'type': 'hg', - 'directory': identifiers.identifier_to_bytes(commit_tree.hash), - 'message': c.description(), - 'metadata': { - 'extra_headers': hg_headers - }, - 'synthetic': False, - 'parents': parents, - } - revision['id'] = identifiers.identifier_to_bytes( - identifiers.revision_identifier(revision)) - self.revisions[c.node()] = revision - for n, r in self.revisions.items(): - yield {'node': n, 'id': r['id']} - - def get_revisions(self): - """Get the revision identifiers from the repository""" - revs = { - r['id']: r['node'] - for r in self.get_revision_ids() - } - missing_revs = set(self.storage.revision_missing(revs.keys())) - for r in missing_revs: - yield self.revisions[revs[r]] - - def has_releases(self): - """Checks whether we need to load releases""" - self.num_releases = len([t for t in self.repo.tags() if not t[3]]) - return self.num_releases > 0 - - def get_releases(self): - """Get the releases that need to be loaded""" - releases = {} - for t in self.repo.tags(): - islocal = t[3] - name = t[0] - if (name != b'tip' and not islocal): - short_hash = t[2] - node_id = self.repo[short_hash].node() - target = self.revisions[node_id]['id'] - release = { - 'name': name, - 'target': target, - 'target_type': 'revision', - 'message': None, - 'metadata': None, - 'synthetic': False, - 'author': {'name': None, 'email': None, 'fullname': b''}, - 'date': None - } - id_bytes = identifiers.identifier_to_bytes( - identifiers.release_identifier(release)) - release['id'] = id_bytes - releases[id_bytes] = release - - missing_rels = set(self.storage.release_missing( - sorted(releases.keys()) - )) - - yield from (releases[r] for r in missing_rels) - - def get_snapshot(self): - """Get the snapshot that need to be loaded""" - self.num_snapshot = 1 - - def _get_branches(repo=self.repo): - for t in ( - repo.tags() + repo.branches() + repo.bookmarks()[0] - ): - name = t[0] - short_hash = t[2] - node = self.repo[short_hash].node() - yield name, { - 'target': self.revisions[node]['id'], - 'target_type': 'revision' - } - - snap = { - 'branches': { - name: branch - for name, branch in _get_branches() - } - } - snap['id'] = identifiers.identifier_to_bytes( - identifiers.snapshot_identifier(snap)) - return snap - - def get_fetch_history_result(self): - """Return the data to store in fetch_history for the current loader""" - return { - 'contents': self.num_contents, - 'directories': len(self.unique_trees), - 'revisions': self.num_revisions, - 'releases': self.num_releases, - 'snapshot': self.num_snapshot, - } - - def save_data(self): - """We already have the 
data locally, no need to save it""" - pass - - def eventful(self): - """Whether the load was eventful""" - return True - - -if __name__ == '__main__': - import logging - import sys - - logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)s %(process)d %(message)s' - ) - loader = HgLoader() - - origin_url = sys.argv[1] - directory = sys.argv[2] - visit_date = datetime.datetime.now(tz=datetime.timezone.utc) - - print(loader.load(origin_url, directory, visit_date)) diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py index 8db2e2e..38c2d02 100644 --- a/swh/loader/mercurial/tests/test_loader.py +++ b/swh/loader/mercurial/tests/test_loader.py @@ -1,251 +1,259 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from nose.tools import istest from swh.loader.core.tests import BaseLoaderTest, LoaderNoStorage from swh.loader.mercurial.loader import HgBundle20Loader class MockStorage: """A mixin inhibited storage overriding *_missing methods. Those are called from within the mercurial loader. Rationale: Need to take control of the current behavior prior to refactor it. The end game is to remove this when we will have tests ok. """ def content_missing(self, contents, key_hash='sha1'): return [c[key_hash] for c in contents] def directory_missing(self, directories): return directories def release_missing(self, releases): return releases def revision_missing(self, revisions): return revisions class BaseHgLoaderTest(BaseLoaderTest): """Mixin base loader test to prepare the mercurial repository to uncompress, load and test the results. This sets up """ def setUp(self, archive_name='the-sandbox.tgz', filename='the-sandbox'): super().setUp(archive_name=archive_name, filename=filename, prefix_tmp_folder_name='swh.loader.mercurial.', start_path=os.path.dirname(__file__)) class HgLoaderNoStorage(LoaderNoStorage, HgBundle20Loader): """The mercurial loader to test. Its behavior has been changed to: - not use any persistence (no storage, or for now a passthrough storage with no filtering) - not use the default configuration loading At the end of the tests, you can make sure you have the rights objects. """ ADDITIONAL_CONFIG = { 'reduce_effort': ('bool', False), # FIXME: This needs to be # checked (in production # for now, this is not # deployed.) 'temp_directory': ('str', '/tmp/swh.loader.mercurial'), 'cache1_size': ('int', 800*1024*1024), 'cache2_size': ('int', 800*1024*1024), 'bundle_filename': ('str', 'HG20_none_bundle'), } def __init__(self): super().__init__() self.origin_id = 1 self.visit = 1 self.storage = MockStorage() class LoaderITest1(BaseHgLoaderTest): """Load a mercurial repository without release """ def setUp(self): super().setUp() self.loader = HgLoaderNoStorage() @istest def load(self): """Load a repository with multiple branches results in 1 snapshot """ # when self.loader.load( origin_url=self.repo_url, visit_date='2016-05-03 15:16:32+00', directory=self.destination_path) # then self.assertCountContents(2) self.assertCountDirectories(3) self.assertCountReleases(0) self.assertCountRevisions(58) tip_revision_develop = 'a9c4534552df370f43f0ef97146f393ef2f2a08c' tip_revision_default = '70e750bb046101fdced06f428e73fee471509c56' # same from rev 3 onward directory_hash = '180bd57623a7c2c47a8c43514a5f4d903503d0aa' # cf. 
test_loader.org for explaining from where those hashes # come from expected_revisions = { # revision hash | directory hash # noqa 'aafb69fd7496ca617f741d38c40808ff2382aabe': 'e2e117569b086ceabeeedee4acd95f35298d4553', # noqa 'b6932cb7f59e746899e4804f3d496126d1343615': '9cd8160c67ac4b0bc97e2e2cd918a580425167d3', # noqa tip_revision_default: directory_hash, '18012a93d5aadc331c468dac84b524430f4abc19': directory_hash, 'bec4c0a31b0b2502f44f34aeb9827cd090cca621': directory_hash, '5f4eba626c3f826820c4475d2d81410759ec911b': directory_hash, 'dcba06661c607fe55ec67b1712d153b69f65e38c': directory_hash, 'c77e776d22548d47a8d96463a3556172776cd59b': directory_hash, '61d762d65afb3150e2653d6735068241779c1fcf': directory_hash, '40def747398c76ceec1bd248e3a6cb2a52e22dc5': directory_hash, '6910964416438ca8d1698f6295871d727c4d4851': directory_hash, 'be44d5e6cc66580f59c108f8bff5911ee91a22e4': directory_hash, 'c4a95d5097519dedac437fddf0ef775136081241': directory_hash, '32eb0354a660128e205bf7c3a84b46040ef70d92': directory_hash, 'dafa445964230e808148db043c126063ea1dc9b6': directory_hash, 'a41e2a548ba51ee47f22baad8e88994853d3e2f5': directory_hash, 'dc3e3ab7fe257d04769528e5e17ad9f1acb44659': directory_hash, 'd2164061453ecb03d4347a05a77db83f706b8e15': directory_hash, '34192ceef239b8b72141efcc58b1d7f1676a18c9': directory_hash, '2652147529269778757d96e09aaf081695548218': directory_hash, '4d640e8064fe69b4c851dfd43915c431e80c7497': directory_hash, 'c313df50bfcaa773dcbe038d00f8bd770ba997f8': directory_hash, '769db00b34b9e085dc699c8f1550c95793d0e904': directory_hash, '2973e5dc9568ac491b198f6b7f10c44ddc04e0a3': directory_hash, 'be34b8c7857a6c04e41cc06b26338d8e59cb2601': directory_hash, '24f45e41637240b7f9e16d2791b5eacb4a406d0f': directory_hash, '62ff4741eac1821190f6c2cdab7c8a9d7db64ad0': directory_hash, 'c346f6ff7f42f2a8ff867f92ab83a6721057d86c': directory_hash, 'f2afbb94b319ef5d60823859875284afb95dcc18': directory_hash, '4e2dc6d6073f0b6d348f84ded52f9143b10344b9': directory_hash, '31cd7c5f669868651c57e3a2ba25ac45f76fa5cf': directory_hash, '25f5b27dfa5ed15d336188ef46bef743d88327d4': directory_hash, '88b80615ed8561be74a700b92883ec0374ddacb0': directory_hash, '5ee9ea92ed8cc1737b7670e39dab6081c64f2598': directory_hash, 'dcddcc32740d2de0e1403e21a5c4ed837b352992': directory_hash, '74335db9f45a5d1c8133ff7a7db5ed7a8d4a197b': directory_hash, 'cb36b894129ca7910bb81c457c72d69d5ff111bc': directory_hash, 'caef0cb155eb6c55215aa59aabe04a9c702bbe6a': directory_hash, '5017ce0b285351da09a2029ea2cf544f79b593c7': directory_hash, '17a62618eb6e91a1d5d8e1246ccedae020d3b222': directory_hash, 'a1f000fb8216838aa2a120738cc6c7fef2d1b4d8': directory_hash, '9f82d95bd3edfb7f18b1a21d6171170395ea44ce': directory_hash, 'a701d39a17a9f48c61a06eee08bd9ac0b8e3838b': directory_hash, '4ef794980f820d44be94b2f0d53eb34d4241638c': directory_hash, 'ddecbc16f4c916c39eacfcb2302e15a9e70a231e': directory_hash, '3565e7d385af0745ec208d719e469c2f58be8e94': directory_hash, 'c875bad563a73a25c5f3379828b161b1441a7c5d': directory_hash, '94be9abcf9558213ff301af0ecd8223451ce991d': directory_hash, '1ee770fd10ea2d8c4f6e68a1dbe79378a86611e0': directory_hash, '553b09724bd30d9691b290e157b27a73e2d3e537': directory_hash, '9e912851eb64e3a1e08fbb587de7a4c897ce5a0a': directory_hash, '9c9e0ff08f215a5a5845ce3dbfc5b48c8050bdaf': directory_hash, 'db9e625ba90056304897a94c92e5d27bc60f112d': directory_hash, '2d4a801c9a9645fcd3a9f4c06418d8393206b1f3': directory_hash, 'e874cd5967efb1f45282e9f5ce87cc68a898a6d0': directory_hash, 'e326a7bbb5bc00f1d8cacd6108869dedef15569c': directory_hash, 
'3ed4b85d30401fe32ae3b1d650f215a588293a9e': directory_hash, tip_revision_develop: directory_hash, } self.assertRevisionsOk(expected_revisions) self.assertCountSnapshots(1) expected_snapshot = { - 'id': '05cad59e8980069d9fe2324d406cf226c0021e1c', + 'id': '3b8fe58e467deb7597b12a5fd3b2c096b8c02028', 'branches': { 'develop': { 'target': tip_revision_develop, 'target_type': 'revision' }, 'default': { 'target': tip_revision_default, 'target_type': 'revision' }, + 'HEAD': { + 'target': 'develop', + 'target_type': 'alias', + } } } self.assertSnapshotOk(expected_snapshot) self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) self.assertEqual(self.loader.visit_status(), 'full') class LoaderITest2(BaseHgLoaderTest): """Load a mercurial repository with release """ def setUp(self): super().setUp(archive_name='hello.tgz', filename='hello') self.loader = HgLoaderNoStorage() @istest def load(self): """Load a repository with tags results in 1 snapshot """ # when self.loader.load( origin_url=self.repo_url, visit_date='2016-05-03 15:16:32+00', directory=self.destination_path) # then self.assertCountContents(3) self.assertCountDirectories(3) self.assertCountReleases(1) self.assertCountRevisions(3) tip_release = '515c4d72e089404356d0f4b39d60f948b8999140' self.assertReleasesOk([tip_release]) tip_revision_default = 'c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27' # cf. test_loader.org for explaining from where those hashes # come from expected_revisions = { # revision hash | directory hash # noqa '93b48d515580522a05f389bec93227fc8e43d940': '43d727f2f3f2f7cb3b098ddad1d7038464a4cee2', # noqa '8dd3db5d5519e4947f035d141581d304565372d2': 'b3f85f210ff86d334575f64cb01c5bf49895b63e', # noqa tip_revision_default: '8f2be433c945384c85920a8e60f2a68d2c0f20fb', } self.assertRevisionsOk(expected_revisions) self.assertCountSnapshots(1) expected_snapshot = { - 'id': 'fa537f8e0cbdb8a54e29533302ed6fcbee28cb7b', + 'id': 'd35668e02e2ba4321dc951cd308cf883786f918a', 'branches': { 'default': { 'target': tip_revision_default, 'target_type': 'revision' }, '0.1': { 'target': tip_release, 'target_type': 'release' + }, + 'HEAD': { + 'target': 'default', + 'target_type': 'alias', } } } self.assertSnapshotOk(expected_snapshot) self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) self.assertEqual(self.loader.visit_status(), 'full') diff --git a/version.txt b/version.txt index 3d755c9..df2f276 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.13-0-g7e8386d \ No newline at end of file +v0.0.14-0-gef75fd6 \ No newline at end of file
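For illustration only (not part of the patch): a minimal sketch of the `branches` mapping that `get_snapshot()` builds once `get_heads()` returns `(pointer_nature, node_id)` tuples — the branch whose head carries Mercurial's `tip` pointer (`HEAD_POINTER_NAME`) is now also exposed through a `HEAD` alias branch. The branch names and revision ids below are taken from the `the-sandbox` test expectations above; in the loader itself the revision targets are raw id bytes rather than hex strings.

``` Python
# Sketch of the snapshot 'branches' structure produced after this change,
# using the values from LoaderITest1 (revision ids shown as hex for
# readability; the loader stores them as bytes).
tip_revision_develop = 'a9c4534552df370f43f0ef97146f393ef2f2a08c'
tip_revision_default = '70e750bb046101fdced06f428e73fee471509c56'

branches = {
    b'develop': {'target': tip_revision_develop, 'target_type': 'revision'},
    b'default': {'target': tip_revision_default, 'target_type': 'revision'},
    # New in 0.0.14: the branch whose head is Mercurial's b'tip' pointer
    # also gets a HEAD alias pointing at that branch name.
    b'HEAD': {'target': b'develop', 'target_type': 'alias'},
}
```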