diff --git a/swh/loader/mercurial/tests/common.py b/swh/loader/mercurial/tests/common.py new file mode 100644 index 0000000..31a71c4 --- /dev/null +++ b/swh/loader/mercurial/tests/common.py @@ -0,0 +1,52 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.loader.mercurial.loader import HgBundle20Loader + +_LOADER_TEST_CONFIG = { + 'bundle_filename': 'HG20_none_bundle', + 'cache1_size': 838860800, + 'cache2_size': 838860800, + 'content_packet_size': 100000, + 'content_packet_size_bytes': 1073741824, + 'content_size_limit': 104857600, + 'directory_packet_size': 25000, + 'log_db': 'dbname=softwareheritage-log', + 'occurrence_packet_size': 100000, + 'reduce_effort': False, + 'release_packet_size': 100000, + 'revision_packet_size': 100000, + 'save_data': False, + 'save_data_path': '', + 'send_contents': True, + 'send_directories': True, + 'send_occurrences': True, + 'send_releases': True, + 'send_revisions': True, + 'send_snapshot': True, + 'storage': {'args': {}, 'cls': 'memory'}, + 'temp_directory': '/tmp/swh.loader.mercurial' +} + + +class HgLoaderMemoryStorage(HgBundle20Loader): + """The mercurial loader to test. + + Its behavior has been changed to: + - not use any persistence (no storage, or for now a passthrough + storage with no filtering) + - not use the default configuration loading + + At the end of the tests, you can make sure you have the right + objects.
+ + """ + def __init__(self): + super().__init__() + self.origin_id = 1 + self.visit = 1 + + def parse_config_file(self, *args, **kwargs): + return _LOADER_TEST_CONFIG diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py index 418846e..eff3260 100644 --- a/swh/loader/mercurial/tests/test_loader.py +++ b/swh/loader/mercurial/tests/test_loader.py @@ -1,247 +1,201 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from swh.loader.core.tests import BaseLoaderTest -from swh.loader.mercurial.loader import HgBundle20Loader - -_LOADER_TEST_CONFIG = { - 'bundle_filename': 'HG20_none_bundle', - 'cache1_size': 838860800, - 'cache2_size': 838860800, - 'content_packet_size': 100000, - 'content_packet_size_bytes': 1073741824, - 'content_size_limit': 104857600, - 'directory_packet_size': 25000, - 'log_db': 'dbname=softwareheritage-log', - 'occurrence_packet_size': 100000, - 'reduce_effort': False, - 'release_packet_size': 100000, - 'revision_packet_size': 100000, - 'save_data': False, - 'save_data_path': '', - 'send_contents': True, - 'send_directories': True, - 'send_occurrences': True, - 'send_releases': True, - 'send_revisions': True, - 'send_snapshot': True, - 'storage': {'args': {}, 'cls': 'memory'}, - 'temp_directory': '/tmp/swh.loader.mercurial' -} - - -class HgLoaderMemoryStorage(HgBundle20Loader): - """The mercurial loader to test. - - Its behavior has been changed to: - - not use any persistence (no storage, or for now a passthrough - storage with no filtering) - - not use the default configuration loading - - At the end of the tests, you can make sure you have the rights - objects. 
- - """ - def __init__(self): - super().__init__() - self.origin_id = 1 - self.visit = 1 - - def parse_config_file(self, *args, **kwargs): - return _LOADER_TEST_CONFIG +from .common import HgLoaderMemoryStorage class BaseHgLoaderTest(BaseLoaderTest): """Mixin base loader test to prepare the mercurial repository to uncompress, load and test the results. This sets up """ def setUp(self, archive_name='the-sandbox.tgz', filename='the-sandbox'): super().setUp(archive_name=archive_name, filename=filename, prefix_tmp_folder_name='swh.loader.mercurial.', start_path=os.path.dirname(__file__)) self.loader = HgLoaderMemoryStorage() self.storage = self.loader.storage class LoaderITest1(BaseHgLoaderTest): """Load a mercurial repository without release """ def test_load(self): """Load a repository with multiple branches results in 1 snapshot """ # when self.loader.load( origin_url=self.repo_url, visit_date='2016-05-03 15:16:32+00', directory=self.destination_path) # then self.assertCountContents(2) self.assertCountDirectories(3) self.assertCountReleases(0) self.assertCountRevisions(58) tip_revision_develop = 'a9c4534552df370f43f0ef97146f393ef2f2a08c' tip_revision_default = '70e750bb046101fdced06f428e73fee471509c56' # same from rev 3 onward directory_hash = '180bd57623a7c2c47a8c43514a5f4d903503d0aa' # cf. 
test_loader.org for explaining from where those hashes # come from expected_revisions = { # revision hash | directory hash # noqa 'aafb69fd7496ca617f741d38c40808ff2382aabe': 'e2e117569b086ceabeeedee4acd95f35298d4553', # noqa 'b6932cb7f59e746899e4804f3d496126d1343615': '9cd8160c67ac4b0bc97e2e2cd918a580425167d3', # noqa tip_revision_default: directory_hash, '18012a93d5aadc331c468dac84b524430f4abc19': directory_hash, 'bec4c0a31b0b2502f44f34aeb9827cd090cca621': directory_hash, '5f4eba626c3f826820c4475d2d81410759ec911b': directory_hash, 'dcba06661c607fe55ec67b1712d153b69f65e38c': directory_hash, 'c77e776d22548d47a8d96463a3556172776cd59b': directory_hash, '61d762d65afb3150e2653d6735068241779c1fcf': directory_hash, '40def747398c76ceec1bd248e3a6cb2a52e22dc5': directory_hash, '6910964416438ca8d1698f6295871d727c4d4851': directory_hash, 'be44d5e6cc66580f59c108f8bff5911ee91a22e4': directory_hash, 'c4a95d5097519dedac437fddf0ef775136081241': directory_hash, '32eb0354a660128e205bf7c3a84b46040ef70d92': directory_hash, 'dafa445964230e808148db043c126063ea1dc9b6': directory_hash, 'a41e2a548ba51ee47f22baad8e88994853d3e2f5': directory_hash, 'dc3e3ab7fe257d04769528e5e17ad9f1acb44659': directory_hash, 'd2164061453ecb03d4347a05a77db83f706b8e15': directory_hash, '34192ceef239b8b72141efcc58b1d7f1676a18c9': directory_hash, '2652147529269778757d96e09aaf081695548218': directory_hash, '4d640e8064fe69b4c851dfd43915c431e80c7497': directory_hash, 'c313df50bfcaa773dcbe038d00f8bd770ba997f8': directory_hash, '769db00b34b9e085dc699c8f1550c95793d0e904': directory_hash, '2973e5dc9568ac491b198f6b7f10c44ddc04e0a3': directory_hash, 'be34b8c7857a6c04e41cc06b26338d8e59cb2601': directory_hash, '24f45e41637240b7f9e16d2791b5eacb4a406d0f': directory_hash, '62ff4741eac1821190f6c2cdab7c8a9d7db64ad0': directory_hash, 'c346f6ff7f42f2a8ff867f92ab83a6721057d86c': directory_hash, 'f2afbb94b319ef5d60823859875284afb95dcc18': directory_hash, '4e2dc6d6073f0b6d348f84ded52f9143b10344b9': directory_hash, 
'31cd7c5f669868651c57e3a2ba25ac45f76fa5cf': directory_hash, '25f5b27dfa5ed15d336188ef46bef743d88327d4': directory_hash, '88b80615ed8561be74a700b92883ec0374ddacb0': directory_hash, '5ee9ea92ed8cc1737b7670e39dab6081c64f2598': directory_hash, 'dcddcc32740d2de0e1403e21a5c4ed837b352992': directory_hash, '74335db9f45a5d1c8133ff7a7db5ed7a8d4a197b': directory_hash, 'cb36b894129ca7910bb81c457c72d69d5ff111bc': directory_hash, 'caef0cb155eb6c55215aa59aabe04a9c702bbe6a': directory_hash, '5017ce0b285351da09a2029ea2cf544f79b593c7': directory_hash, '17a62618eb6e91a1d5d8e1246ccedae020d3b222': directory_hash, 'a1f000fb8216838aa2a120738cc6c7fef2d1b4d8': directory_hash, '9f82d95bd3edfb7f18b1a21d6171170395ea44ce': directory_hash, 'a701d39a17a9f48c61a06eee08bd9ac0b8e3838b': directory_hash, '4ef794980f820d44be94b2f0d53eb34d4241638c': directory_hash, 'ddecbc16f4c916c39eacfcb2302e15a9e70a231e': directory_hash, '3565e7d385af0745ec208d719e469c2f58be8e94': directory_hash, 'c875bad563a73a25c5f3379828b161b1441a7c5d': directory_hash, '94be9abcf9558213ff301af0ecd8223451ce991d': directory_hash, '1ee770fd10ea2d8c4f6e68a1dbe79378a86611e0': directory_hash, '553b09724bd30d9691b290e157b27a73e2d3e537': directory_hash, '9e912851eb64e3a1e08fbb587de7a4c897ce5a0a': directory_hash, '9c9e0ff08f215a5a5845ce3dbfc5b48c8050bdaf': directory_hash, 'db9e625ba90056304897a94c92e5d27bc60f112d': directory_hash, '2d4a801c9a9645fcd3a9f4c06418d8393206b1f3': directory_hash, 'e874cd5967efb1f45282e9f5ce87cc68a898a6d0': directory_hash, 'e326a7bbb5bc00f1d8cacd6108869dedef15569c': directory_hash, '3ed4b85d30401fe32ae3b1d650f215a588293a9e': directory_hash, tip_revision_develop: directory_hash, } self.assertRevisionsContain(expected_revisions) self.assertCountSnapshots(1) expected_snapshot = { 'id': '3b8fe58e467deb7597b12a5fd3b2c096b8c02028', 'branches': { 'develop': { 'target': tip_revision_develop, 'target_type': 'revision' }, 'default': { 'target': tip_revision_default, 'target_type': 'revision' }, 'HEAD': { 'target': 
'develop', 'target_type': 'alias', } } } self.assertSnapshotEqual(expected_snapshot) self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) self.assertEqual(self.loader.visit_status(), 'full') class LoaderITest2(BaseHgLoaderTest): """Load a mercurial repository with release """ def setUp(self): super().setUp(archive_name='hello.tgz', filename='hello') def test_load(self): """Load a repository with tags results in 1 snapshot """ # when self.loader.load( origin_url=self.repo_url, visit_date='2016-05-03 15:16:32+00', directory=self.destination_path) # then self.assertCountContents(3) self.assertCountDirectories(3) self.assertCountReleases(1) self.assertCountRevisions(3) tip_release = '515c4d72e089404356d0f4b39d60f948b8999140' self.assertReleasesContain([tip_release]) tip_revision_default = 'c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27' # cf. test_loader.org for explaining from where those hashes # come from expected_revisions = { # revision hash | directory hash # noqa '93b48d515580522a05f389bec93227fc8e43d940': '43d727f2f3f2f7cb3b098ddad1d7038464a4cee2', # noqa '8dd3db5d5519e4947f035d141581d304565372d2': 'b3f85f210ff86d334575f64cb01c5bf49895b63e', # noqa tip_revision_default: '8f2be433c945384c85920a8e60f2a68d2c0f20fb', } self.assertRevisionsContain(expected_revisions) self.assertCountSnapshots(1) expected_snapshot = { 'id': 'd35668e02e2ba4321dc951cd308cf883786f918a', 'branches': { 'default': { 'target': tip_revision_default, 'target_type': 'revision' }, '0.1': { 'target': tip_release, 'target_type': 'release' }, 'HEAD': { 'target': 'default', 'target_type': 'alias', } } } self.assertSnapshotEqual(expected_snapshot) self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) self.assertEqual(self.loader.visit_status(), 'full') diff --git a/swh/loader/mercurial/loader_verifier.py b/swh/loader/mercurial/tests/test_loader_verifier.py similarity index 74% rename from swh/loader/mercurial/loader_verifier.py rename to 
swh/loader/mercurial/tests/test_loader_verifier.py index 5fb2497..ed2d4d6 100644 --- a/swh/loader/mercurial/loader_verifier.py +++ b/swh/loader/mercurial/tests/test_loader_verifier.py @@ -1,238 +1,249 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import click import code import datetime import hglib import logging import os import random import sys import time +import urllib.parse from binascii import hexlify, unhexlify - from swh.model.hashutil import MultiHash +from ..converters import PRIMARY_ALGO as ALGO +from ..objects import SimpleTree + +from swh.loader.core.tests import BaseLoaderTest + +from .common import HgLoaderMemoryStorage -from .loader import HgBundle20Loader -from .converters import PRIMARY_ALGO as ALGO -from .objects import SimpleTree +class HgLoaderValidater: + """Loader validater + + """ + def __init__(self, loader): + self.loader = loader -class HgLoaderValidater(HgBundle20Loader): def generate_all_blobs(self, validate=True, frequency=1): logging.debug('GENERATING BLOBS') i = 0 start = time.time() u = set() - for blob, node_info in self.br.yield_all_blobs(): + for blob, node_info in self.loader.br.yield_all_blobs(): filename = node_info[0] header = node_info[2] i += 1 hashes = MultiHash.from_data(blob, hash_names=set([ALGO])).digest() bhash = hashes[ALGO] self.file_node_to_hash[header['node']] = bhash u.update([bhash]) if validate: if random.random() < frequency: self.validate_blob(filename, header, blob) if i % 10000 == 0: logging.debug(i) logging.debug('\nFOUND %s BLOBS' % i) logging.debug('FOUND: %s UNIQUE BLOBS' % len(u)) logging.debug('ELAPSED: %s' % (time.time()-start)) def validate_blob(self, filename, header, blob): if not self.hg: self.hg = hglib.open(self.hgdir) data = bytes(blob) filepath = os.path.join(self.hg.root(), 
bytes(filename)) linknode = hexlify(header['linknode']) cat_contents = self.hg.cat([filepath], rev=linknode) if cat_contents != data: logging.debug('INTERNAL ERROR ERROR ERROR ERROR') logging.debug(filename) logging.debug(header) logging.debug('-----') logging.debug(cat_contents) logging.debug('---- vs ----') logging.debug(data) code.interact(local=dict(globals(), **locals())) quit() else: logging.debug('v', end='') def generate_all_trees(self, validate=True, frequency=1): logging.debug('GENERATING MANIFEST TREES') c = 0 n = 0 u = set() start = time.time() validated = 0 - for header, tree, new_dirs in self.load_directories(): + for header, tree, new_dirs in self.loader.load_directories(): if validate and (c >= validated) and (random.random() < frequency): self.validate_tree(tree, header, c) for d in new_dirs: u.add(d['id']) c += 1 n += len(new_dirs) logging.debug('.', end='') if c % 20 == 0: sys.stdout.flush() if c % 10000 == 0: logging.debug(c) logging.debug('\nFOUND: %s COMMIT MANIFESTS' % c) logging.debug('FOUND: %s NEW DIRS' % n) logging.debug('FOUND: %s UNIQUE DIRS' % len(u)) logging.debug('ELAPSED: %s' % (time.time()-start)) def validate_tree(self, tree, header, i): if not self.hg: self.hg = hglib.open(self.hgdir) commit_id = header['linknode'] if len(commit_id) == 20: commit_id = hexlify(commit_id) base_tree = SimpleTree() base_files = list(self.hg.manifest(rev=commit_id)) bfiles = sorted([f[4] for f in base_files]) for p in base_files: base_tree.add_blob( p[4], self.file_node_to_hash[unhexlify(p[0])], p[3], p[1] ) base_tree.hash_changed() files = sorted(list(tree.flatten().keys())) if tree != base_tree: logging.debug('validating rev: %s commit: %s' % (i, commit_id)) logging.debug('validating files: %s %s INVALID TREE' % ( len(files), len(base_files))) def so1(a): keys = [k['name'] for k in a['entries']] return b''.join(sorted(keys)) tree_dirs = [d for d in tree.yield_swh_directories()] base_dirs = [d for d in base_tree.yield_swh_directories()] 
tree_dirs.sort(key=so1) base_dirs.sort(key=so1) logging.debug('Program will quit after your next Ctrl-D') code.interact(local=dict(globals(), **locals())) quit() else: logging.debug('v') def generate_all_commits(self, validate=True, frequency=1): i = 0 start = time.time() for rev in self.get_revisions(): logging.debug('.', end='') i += 1 if i % 20 == 0: sys.stdout.flush() logging.debug('') logging.debug('\nFOUND: %s COMMITS' % i) logging.debug('ELAPSED: %s' % (time.time()-start)) def runtest(self, hgdir, validate_blobs=False, validate_trees=False, - frequency=1.0, test_iterative=False): - """HgLoaderValidater().runtest('/home/avi/SWH/mozilla-unified') + frequency=1.0): + """loader = HgLoaderMemoryStorage() + HgLoaderValidater(loader).runtest('/home/avi/SWH/mozilla-unified') """ self.origin_id = 'test' dt = datetime.datetime.now(tz=datetime.timezone.utc) - if test_iterative: - dt = dt - datetime.timedelta(10) hgrepo = None if (hgdir.lower().startswith('http:') or hgdir.lower().startswith('https:')): hgrepo, hgdir = hgdir, hgrepo self.hgdir = hgdir try: logging.debug('preparing') - self.prepare(origin_url=hgrepo, visit_date=dt, directory=hgdir) + self.loader.prepare( + origin_url=hgrepo, visit_date=dt, directory=hgdir) self.file_node_to_hash = {} logging.debug('getting contents') cs = 0 - for c in self.get_contents(): + for c in self.loader.get_contents(): cs += 1 pass logging.debug('getting directories') ds = 0 - for d in self.get_directories(): + for d in self.loader.get_directories(): ds += 1 pass revs = 0 logging.debug('getting revisions') - for rev in self.get_revisions(): + for rev in self.loader.get_revisions(): revs += 1 pass logging.debug('getting releases') rels = 0 - for rel in self.get_releases(): + for rel in self.loader.get_releases(): rels += 1 logging.debug(rel) self.visit = 'foo' + snps = 0 logging.debug('getting snapshot') - o = self.get_snapshot() + o = self.loader.get_snapshot() logging.debug('Snapshot: %s' % o) + if o: + snps += 1 finally: -
self.cleanup() - - logging.info('final count: cs %s ds %s revs %s rels %s' % ( - cs, ds, revs, rels)) - - -@click.command() -@click.option('--verbose', is_flag=True, default=False) -@click.option('--validate-frequency', default=0.001, type=click.FLOAT) -@click.option('--test-iterative', default=False, type=click.BOOL) -@click.argument('repository-url', required=1) -def main(verbose, validate_frequency, test_iterative, repository_url): - logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO) - while repository_url[-1] == '/': - repository_url = repository_url[:-1] - - HgLoaderValidater().runtest( - repository_url, - validate_blobs=True, validate_trees=True, - frequency=validate_frequency, - test_iterative=test_iterative) - - -if __name__ == '__main__': - main() + self.loader.cleanup() + + return cs, ds, revs, rels, snps + + +class LoaderVerifierTest(BaseLoaderTest): + def setUp(self, archive_name='the-sandbox.tgz', filename='the-sandbox'): + super().setUp(archive_name=archive_name, filename=filename, + prefix_tmp_folder_name='swh.loader.mercurial.', + start_path=os.path.dirname(__file__)) + loader = HgLoaderMemoryStorage() + self.validator = HgLoaderValidater(loader) + + def test_data(self): + repo_path = urllib.parse.urlparse(self.repo_url).path + cs, ds, revs, rels, snps = self.validator.runtest( + repo_path, + validate_blobs=True, + validate_trees=True, + frequency=0.001) + + self.assertEqual(cs, 2) + self.assertEqual(ds, 3) + self.assertEqual(revs, 58) + self.assertEqual(rels, 0) + self.assertEqual(snps, 1)