# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# NOTE(review): reconstructed from a whitespace-collapsed patch of
# swh/loader/mercurial/tests/test_loader_verifier.py (index ed2d4d6..e95aba8);
# this is the post-patch state of the module.

import code
import datetime
import hglib
import logging
import os
import random
import sys
import time
import urllib.parse

from binascii import hexlify, unhexlify

from swh.model.hashutil import MultiHash

from ..converters import PRIMARY_ALGO as ALGO
from ..objects import SimpleTree

from swh.loader.core.tests import BaseLoaderTest
from .common import HgLoaderMemoryStorage


class HgLoaderValidater:
    """Drive a Mercurial loader and cross-check its output against hglib.

    The ``generate_all_*`` / ``validate_*`` methods are debugging helpers:
    on a mismatch they log the details, open an interactive console with the
    current variables and then terminate the process. ``runtest`` is the
    entry point used by the test cases below; it only counts what the loader
    yields.
    """

    def __init__(self, loader):
        """Wrap *loader*, the loader instance whose output is checked."""
        self.loader = loader
        # Lazily opened hglib client; see validate_blob / validate_tree.
        self.hg = None
        # Local repository path; set by runtest.
        self.hgdir = None
        # Maps a file node id to the ALGO hash of its content.
        self.file_node_to_hash = {}

    def generate_all_blobs(self, validate=True, frequency=1):
        """Hash every blob of the bundle and optionally validate a sample.

        Args:
            validate: when true, blobs are candidates for validation.
            frequency: probability, per blob, that it is validated.
        """
        logging.debug('GENERATING BLOBS')
        i = 0
        start = time.time()
        u = set()
        for blob, node_info in self.loader.br.yield_all_blobs():
            filename = node_info[0]
            header = node_info[2]
            i += 1

            hashes = MultiHash.from_data(blob, hash_names={ALGO}).digest()
            bhash = hashes[ALGO]
            self.file_node_to_hash[header['node']] = bhash

            u.add(bhash)

            # Random sampling keeps validation affordable on big repos.
            if validate and random.random() < frequency:
                self.validate_blob(filename, header, blob)

            if i % 10000 == 0:
                logging.debug(i)

        logging.debug('\nFOUND %s BLOBS', i)
        logging.debug('FOUND: %s UNIQUE BLOBS', len(u))
        logging.debug('ELAPSED: %s', time.time() - start)

    def validate_blob(self, filename, header, blob):
        """Compare *blob* with what ``hg cat`` returns for the same file.

        On mismatch, logs both contents, opens an interactive console and
        exits the process.
        """
        if not self.hg:
            self.hg = hglib.open(self.hgdir)

        data = bytes(blob)

        filepath = os.path.join(self.hg.root(), bytes(filename))
        linknode = hexlify(header['linknode'])
        cat_contents = self.hg.cat([filepath], rev=linknode)

        if cat_contents != data:
            logging.debug('INTERNAL ERROR ERROR ERROR ERROR')
            logging.debug(filename)
            logging.debug(header)
            logging.debug('-----')
            logging.debug(cat_contents)
            logging.debug('---- vs ----')
            logging.debug(data)
            # Hand every local over to the console for post-mortem inspection.
            code.interact(local=dict(globals(), **locals()))
            sys.exit()
        else:
            logging.debug('v')

    def generate_all_trees(self, validate=True, frequency=1):
        """Walk every commit manifest and optionally validate a sample.

        Args:
            validate: when true, trees are candidates for validation.
            frequency: probability, per manifest, that it is validated.
        """
        logging.debug('GENERATING MANIFEST TREES')

        c = 0
        n = 0
        u = set()

        start = time.time()
        # Index of the first manifest eligible for validation.
        validated = 0

        for header, tree, new_dirs in self.loader.load_directories():
            if (validate and (c >= validated)
                    and (random.random() < frequency)):
                self.validate_tree(tree, header, c)

            for d in new_dirs:
                u.add(d['id'])

            c += 1
            n += len(new_dirs)

            logging.debug('.')
            if c % 20 == 0:
                # NOTE(review): presumably a leftover from print-based
                # progress output; harmless, so kept.
                sys.stdout.flush()
            if c % 10000 == 0:
                logging.debug(c)

        logging.debug('\nFOUND: %s COMMIT MANIFESTS', c)
        logging.debug('FOUND: %s NEW DIRS', n)
        logging.debug('FOUND: %s UNIQUE DIRS', len(u))
        logging.debug('ELAPSED: %s', time.time() - start)

    def validate_tree(self, tree, header, i):
        """Compare *tree* with a tree rebuilt from ``hg manifest``.

        Args:
            tree: the tree produced by the loader for this manifest.
            header: manifest header; ``header['linknode']`` is the commit id.
            i: ordinal of the manifest, used only in log output.

        On mismatch, opens an interactive console and exits the process.
        """
        if not self.hg:
            self.hg = hglib.open(self.hgdir)

        commit_id = header['linknode']
        if len(commit_id) == 20:
            # 20 raw bytes: convert to the hex form passed to hglib.
            commit_id = hexlify(commit_id)

        base_tree = SimpleTree()
        base_files = list(self.hg.manifest(rev=commit_id))
        # Not used below, but exposed to the interactive console on mismatch.
        bfiles = sorted([f[4] for f in base_files])  # noqa: F841

        # NOTE(review): entries are presumably
        # (node, permission, executable, symlink, path) -- confirm with hglib.
        for p in base_files:
            base_tree.add_blob(
                p[4], self.file_node_to_hash[unhexlify(p[0])], p[3], p[1]
            )

        base_tree.hash_changed()

        files = sorted(tree.flatten().keys())

        if tree != base_tree:
            logging.debug('validating rev: %s commit: %s', i, commit_id)
            logging.debug('validating files: %s %s INVALID TREE',
                          len(files), len(base_files))

            def so1(a):
                keys = [k['name'] for k in a['entries']]
                return b''.join(sorted(keys))

            # Sorted directory listings, prepared for manual comparison in
            # the console opened below.
            tree_dirs = list(tree.yield_swh_directories())
            base_dirs = list(base_tree.yield_swh_directories())
            tree_dirs.sort(key=so1)
            base_dirs.sort(key=so1)

            logging.debug('Program will quit after your next Ctrl-D')
            code.interact(local=dict(globals(), **locals()))
            sys.exit()
        else:
            logging.debug('v')

    def generate_all_commits(self, validate=True, frequency=1):
        """Iterate over every revision from the loader and log the count.

        ``validate`` and ``frequency`` are accepted for symmetry with the
        other ``generate_all_*`` methods but are not used.
        """
        i = 0
        start = time.time()
        for _ in self.loader.get_revisions():
            logging.debug('.')
            i += 1
            if i % 20 == 0:
                sys.stdout.flush()

        logging.debug('')
        logging.debug('\nFOUND: %s COMMITS', i)
        logging.debug('ELAPSED: %s', time.time() - start)

    def runtest(self, hgdir, validate_blobs=False, validate_trees=False,
                frequency=1.0):
        """Run the loader over *hgdir* and count the objects it yields.

        Example::

            loader = HgLoaderMemoryStorage()
            HgLoaderValidater(loader).runtest('/home/avi/SWH/mozilla-unified')

        Args:
            hgdir: local repository path, or an ``http:``/``https:`` URL
                (in which case it is passed to the loader as ``origin_url``
                and ``directory`` is None).
            validate_blobs: currently unused by this method.
            validate_trees: currently unused by this method.
            frequency: currently unused by this method.

        Returns:
            A tuple ``(contents, directories, revisions, releases,
            snapshots)`` of counts; ``snapshots`` is 1 when the loader
            returned a truthy snapshot, else 0.
        """
        self.origin_id = 'test'

        dt = datetime.datetime.now(tz=datetime.timezone.utc)

        hgrepo = None
        if hgdir.lower().startswith(('http:', 'https:')):
            # Remote origin: the URL becomes the origin, no local directory.
            hgrepo, hgdir = hgdir, hgrepo

        self.hgdir = hgdir

        try:
            logging.debug('preparing')
            self.loader.prepare(
                origin_url=hgrepo, visit_date=dt, directory=hgdir)

            self.file_node_to_hash = {}

            logging.debug('getting contents')
            cs = sum(1 for _ in self.loader.get_contents())

            logging.debug('getting directories')
            ds = sum(1 for _ in self.loader.get_directories())

            logging.debug('getting revisions')
            revs = sum(1 for _ in self.loader.get_revisions())

            logging.debug('getting releases')
            rels = 0
            for rel in self.loader.get_releases():
                rels += 1
                logging.debug(rel)

            self.visit = 'foo'

            logging.debug('getting snapshot')
            o = self.loader.get_snapshot()
            logging.debug('Snapshot: %s', o)
            snps = 1 if o else 0
        finally:
            # Always release the loader's resources, even on failure.
            self.loader.cleanup()

        return cs, ds, revs, rels, snps


class BaseLoaderVerifierTest(BaseLoaderTest):
    """Common fixture: unpack an archive and build a validator on it."""

    def setUp(self, archive_name='the-sandbox.tgz', filename='the-sandbox'):
        super().setUp(archive_name=archive_name, filename=filename,
                      prefix_tmp_folder_name='swh.loader.mercurial.',
                      start_path=os.path.dirname(__file__))
        loader = HgLoaderMemoryStorage()
        self.validator = HgLoaderValidater(loader)


class LoaderVerifierTest1(BaseLoaderVerifierTest):
    """Object counts for the default ``the-sandbox`` archive."""

    def test_data(self):
        repo_path = urllib.parse.urlparse(self.repo_url).path

        cs, ds, revs, rels, snps = self.validator.runtest(
            repo_path,
            validate_blobs=True,
            validate_trees=True,
            frequency=0.001)

        self.assertEqual(cs, 2)
        self.assertEqual(ds, 3)
        self.assertEqual(revs, 58)
        self.assertEqual(rels, 0)
        self.assertEqual(snps, 1)


class LoaderVerifierTest2(BaseLoaderVerifierTest):
    """Object counts for the ``hello`` archive."""

    def setUp(self, archive_name='hello.tgz', filename='hello'):
        super().setUp(archive_name=archive_name, filename=filename)

    def test_data(self):
        repo_path = urllib.parse.urlparse(self.repo_url).path

        cs, ds, revs, rels, snps = self.validator.runtest(
            repo_path,
            validate_blobs=True,
            validate_trees=True,
            frequency=0.001)

        self.assertEqual(cs, 3)
        self.assertEqual(ds, 3)
        self.assertEqual(rels, 1)
        self.assertEqual(revs, 3)
        self.assertEqual(snps, 1)