diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -11,25 +11,13 @@ cls: remote args: url: http://localhost:5002/ - -send_contents: True -send_directories: True -send_revisions: True -send_releases: True -send_occurrences: True -content_packet_size: 1000 -content_packet_size_bytes: 1073741824 -directory_packet_size: 2500 -revision_packet_size: 1000 -release_packet_size: 1000 -occurrence_packet_size: 1000 ``` # Basic use From python3's toplevel: -## Remote (failure) +## Remote ``` Python project = 'hello' @@ -47,7 +35,7 @@ t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00') ``` -## local directory (failure) +## local directory Only origin, contents, and directories are filled so far. @@ -61,13 +49,13 @@ import logging logging.basicConfig(level=logging.DEBUG) -from swh.loader.mercurial.tasks import SlowLoadMercurialTsk +from swh.loader.mercurial.tasks import LoadMercurialTsk -t = SlowLoadMercurialTsk() +t = LoadMercurialTsk() t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00') ``` -## local archive (failure) +## local archive ``` Python project = '756015-ipv6-source-archive.zip' @@ -77,8 +65,8 @@ import logging logging.basicConfig(level=logging.DEBUG) -from swh.loader.mercurial.tasks import SlowLoadMercurialArchiveTsk +from swh.loader.mercurial.tasks import LoadArchiveMercurialTsk -t = SlowLoadMercurialArchiveTsk() +t = LoadArchiveMercurialTsk() t.run(origin_url=origin_url, archive_path=archive_path, visit_date='2016-05-03T15:16:32+00:00') ``` diff --git a/debian/control b/debian/control --- a/debian/control +++ b/debian/control @@ -13,7 +13,7 @@ python3-hglib, patool, python3-swh.core (>= 0.0.36~), - python3-swh.model (>= 0.0.20~), + python3-swh.model (>= 0.0.27~), python3-swh.storage (>= 0.0.95~), python3-swh.scheduler (>= 0.0.19~), python3-swh.loader.core (>= 0.0.33~), @@ -25,7 +25,7 @@ Architecture: all Depends: python3-swh.core (>= 0.0.36~), python3-swh.loader.core (>= 0.0.33~), - python3-swh.model (>= 0.0.20~), + python3-swh.model (>= 0.0.27~), python3-swh.storage (>= 0.0.95~), python3-swh.scheduler (>= 0.0.19~), patool, diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.36 -swh.model >= 0.0.20 +swh.model >= 0.0.27 swh.storage >= 0.0.95 swh.scheduler >= 0.0.19 swh.loader.core >= 0.0.33 diff --git a/swh/loader/mercurial/bundle20_loader.py b/swh/loader/mercurial/bundle20_loader.py --- a/swh/loader/mercurial/bundle20_loader.py +++ b/swh/loader/mercurial/bundle20_loader.py @@ -27,7 +27,11 @@ from shutil import rmtree from tempfile import mkdtemp -from swh.model import hashutil, identifiers +from swh.model import identifiers +from swh.model.hashutil import ( + MultiHash, hash_to_hex, hash_to_bytes, + DEFAULT_ALGORITHMS +) from swh.loader.core.loader import SWHStatelessLoader from swh.loader.core.converters import content_for_storage from swh.loader.core.utils import clean_dangling_folders @@ -100,15 +104,14 @@ """ b = {} for _, node_hash_id, _, branch_name, *_ in repo.heads(): - b[branch_name] = hashutil.hash_to_bytes( + b[branch_name] = hash_to_bytes( node_hash_id.decode()) bookmarks = repo.bookmarks() if bookmarks and bookmarks[0]: for bookmark_name, _, target_short in bookmarks[0]: target = repo[target_short].node() - b[bookmark_name] = hashutil.hash_to_bytes( - target.decode()) + b[bookmark_name] = hash_to_bytes(target.decode()) return b @@ -225,12 +228,14 @@ file_name = node_info[0] header = node_info[2] + length = len(blob) if header['linknode'] in self.reduce_effort: - content = hashutil.hash_data(blob, algorithms=[ALGO], - with_length=True) + algorithms = [ALGO] else: - content = hashutil.hash_data(blob, with_length=True) - + algorithms = DEFAULT_ALGORITHMS + h = MultiHash.from_data(blob, hash_names=algorithms, length=length) + content = h.digest() + content['length'] = length blob_hash = content[ALGO] self.file_node_to_hash[header['node']] = blob_hash @@ -273,7 +278,6 @@ content = contents.pop(node_hashes[node], None) if content: content['data'] = blob - content['length'] = len(blob) yield content_for_storage( content, log=self.log, @@ -380,7 +384,7 @@ 'directory': directory_id, 'message': commit['message'], 'metadata': { - 'node': hashutil.hash_to_hex(header['node']), + 'node': hash_to_hex(header['node']), 'extra_headers': [ ['time_offset_seconds', str(commit['time_offset_seconds']).encode('utf-8')], @@ -397,7 +401,7 @@ if p2: revision['parents'].append(p2) - revision['id'] = hashutil.hash_to_bytes( + revision['id'] = hash_to_bytes( identifiers.revision_identifier(revision) ) self.node_2_rev[header['node']] = revision['id'] @@ -433,7 +437,7 @@ self.num_releases += 1 node, name = self._read_tag(t) node = node.decode() - node_bytes = hashutil.hash_to_bytes(node) + node_bytes = hash_to_bytes(node) if not TAG_PATTERN.match(node): self.log.warn('Wrong pattern (%s) found in tags. Skipping' % ( node, )) @@ -454,7 +458,7 @@ 'author': {'name': None, 'email': None, 'fullname': b''}, 'date': None } - id_hash = hashutil.hash_to_bytes( + id_hash = hash_to_bytes( identifiers.release_identifier(release)) release['id'] = id_hash missing_releases.append(id_hash) diff --git a/swh/loader/mercurial/bundle20_loader_verifier.py b/swh/loader/mercurial/bundle20_loader_verifier.py --- a/swh/loader/mercurial/bundle20_loader_verifier.py +++ b/swh/loader/mercurial/bundle20_loader_verifier.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -13,7 +13,7 @@ from binascii import hexlify, unhexlify -from swh.model import hashutil +from swh.model.hashutil import MultiHash from .bundle20_loader import HgBundle20Loader from .converters import PRIMARY_ALGO as ALGO @@ -31,7 +31,8 @@ header = node_info[2] i += 1 - bhash = hashutil.hash_data(blob, algorithms=set([ALGO]))[ALGO] + hashes = MultiHash.from_data(blob, hash_names=set([ALGO])).digest() + bhash = hashes[ALGO] self.file_node_to_hash[header['node']] = bhash u.update([bhash]) diff --git a/swh/loader/mercurial/slow_loader.py b/swh/loader/mercurial/slow_loader.py --- a/swh/loader/mercurial/slow_loader.py +++ b/swh/loader/mercurial/slow_loader.py @@ -11,7 +11,8 @@ import hglib import os -from swh.model import identifiers, hashutil +from swh.model import identifiers +from swh.model.hashutil import MultiHash, DEFAULT_ALGORITHMS, hash_to_hex from swh.loader.core.loader import SWHStatelessLoader from .converters import parse_author, PRIMARY_ALGO as ALGO @@ -51,8 +52,9 @@ content.update(existing_hashes) hash_types = list(existing_hashes.keys()) - hashes_to_do = hashutil.DEFAULT_ALGORITHMS.difference(hash_types) - content.update(hashutil.hash_data(data, algorithms=hashes_to_do)) + hashes_to_do = DEFAULT_ALGORITHMS.difference(hash_types) + hashes = MultiHash.from_data(data, hash_names=hashes_to_do).digest() + content.update(hashes) if max_size and (size > max_size): content.update({ @@ -60,7 +62,7 @@ 'reason': 'Content too large', }) if logger: - id_hash = hashutil.hash_to_hex(content[ALGO]) + id_hash = hash_to_hex(content[ALGO]) logger.info( 'Skipping content %s, too large (%s > %s)' % (id_hash, size, max_size),