diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1 +1,2 @@ swh.core[http] >= 0.0.63 +swh.model diff --git a/swh/graph/cli.py b/swh/graph/cli.py --- a/swh/graph/cli.py +++ b/swh/graph/cli.py @@ -1,7 +1,14 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + import click +import sys from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.graph import client +from swh.graph.pid import PidToIntMap, IntToPidMap @click.group(name='graph', context_settings=CONTEXT_SETTINGS, @@ -27,6 +34,78 @@ print(app.stats()) +@cli.group('map') +@click.pass_context +def map(ctx): + """Manage swh-graph on-disk maps""" + pass + + +def dump_pid2int(filename): + for (pid, int) in PidToIntMap(filename): + print('{}\t{}'.format(pid, int)) + + +def dump_int2pid(filename): + for (int, pid) in enumerate(IntToPidMap(filename)): + print('{}\t{}'.format(int, pid)) + + +def restore_pid2int(filename): + """read a textual PID->int map from stdin and write its binary version to + filename + + """ + with open(filename, 'wb') as dst: + for line in sys.stdin: + (str_pid, str_int) = line.split() + PidToIntMap.write_record(dst, str_pid, int(str_int)) + + +def restore_int2pid(filename): + """read a textual int->PID map from stdin and write its binary version to + filename + + """ + with open(filename, 'wb') as dst: + for line in sys.stdin: + (_str_int, str_pid) = line.split() + IntToPidMap.write_record(dst, str_pid) + + +@map.command('dump') +@click.option('--type', '-t', 'map_type', required=True, + type=click.Choice(['pid2int', 'int2pid']), + help='type of map to dump') +@click.argument('filename', required=True, type=click.Path(exists=True)) +@click.pass_context +def dump_map(ctx, map_type, filename): + """dump a binary PID<->int map to textual format""" + if map_type == 'pid2int': + dump_pid2int(filename) + elif map_type == 'int2pid': + dump_int2pid(filename) + else: + raise ValueError('invalid map type: ' + map_type) + pass + + +@map.command('restore') +@click.option('--type', '-t', 'map_type', required=True, + type=click.Choice(['pid2int', 'int2pid']), + help='type of map to dump') +@click.argument('filename', required=True, type=click.Path()) +@click.pass_context +def restore_map(ctx, map_type, filename): + """restore a binary PID<->int map from textual format""" + if map_type == 'pid2int': + restore_pid2int(filename) + elif map_type == 'int2pid': + restore_int2pid(filename) + else: + raise ValueError('invalid map type: ' + map_type) + + def main(): return cli(auto_envvar_prefix='SWH_GRAPH') diff --git a/swh/graph/pid.py b/swh/graph/pid.py new file mode 100644 --- /dev/null +++ b/swh/graph/pid.py @@ -0,0 +1,296 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import mmap +import os +import struct + +from collections.abc import Mapping, Sequence +from enum import Enum + +from swh.model.identifiers import PersistentId, parse_persistent_identifier + + +PID_BIN_FMT = 'BB20s' # 2 unsigned chars + 20 bytes +INT_BIN_FMT = '>q' # big endian, 8-byte integer +PID_BIN_SIZE = 22 # in bytes +INT_BIN_SIZE = 8 # in bytes + + +class PidType(Enum): + """types of existing PIDs, used to serialize PID type as a (char) integer + + note that the order does matter also for driving the binary search in + PID-indexed maps + + """ + content = 1 + directory = 2 + origin = 3 + release = 4 + revision = 5 + snapshot = 6 + + +def str_to_bytes(pid): + """Convert a PID to a byte sequence + + The binary format used to represent PIDs as byte sequences is as follows: + + - 1 byte for the namespace version represented as a C `unsigned char` + - 1 byte for the object type, as the int value of :class:`PidType` enums, + represented as a C `unsigned char` + - 20 bytes for the SHA1 digest as a byte sequence + + Args: + pid (str): persistent identifier + + Returns: + bytes (bytes): byte sequence representation of pid + + """ + pid = parse_persistent_identifier(pid) + return struct.pack(PID_BIN_FMT, pid.scheme_version, + PidType[pid.object_type].value, + bytes.fromhex(pid.object_id)) + + +def bytes_to_str(bytes): + """Inverse function of :func:`str_to_bytes` + + See :func:`str_to_bytes` for a description of the binary PID format. + + Args: + bytes (bytes): byte sequence representation of pid + + Returns: + pid (str): persistent identifier + + """ + (version, type, bin_digest) = struct.unpack(PID_BIN_FMT, bytes) + pid = PersistentId(object_type=PidType(type).name, object_id=bin_digest) + return str(pid) + + +class _OnDiskMap(): + """mmap-ed on-disk sequence of fixed size records + + """ + + def __init__(self, record_size, fname): + """open an existing on-disk map + + Args: + record_size (int): size of each record in bytes + fname (str): path to the on-disk map + + """ + self.record_size = record_size + self.size = os.path.getsize(fname) + (self.length, remainder) = divmod(self.size, record_size) + if remainder: + raise ValueError( + 'map size {} is not a multiple of the record size {}'.format( + self.size, record_size)) + + self.f = open(fname, 'rb') + self.mm = mmap.mmap(self.f.fileno(), self.size, mmap.MAP_PRIVATE) + + def close(self): + """close the map + + shuts down both the mmap and the underlying file descriptor + + """ + if not self.mm.closed: + self.mm.close() + if not self.f.closed: + self.f.close() + + +class PidToIntMap(_OnDiskMap, Mapping): + """memory mapped map from PID (:ref:`persistent-identifiers`) to a continuous + range 0..N of (8-byte long) integers + + This is the converse mapping of :class:`IntToPidMap`. + + The on-disk serialization format is a sequence of fixed length (30 bytes) + records with the following fields: + + - PID (22 bytes): binary PID representation as per :func:`str_to_bytes` + - long (8 bytes): big endian long integer + + The records are sorted lexicographically by PID type and checksum, where + type is the integer value of :class:`PidType`. PID lookup in the map is + performed via binary search. Hence a huge map with, say, 11 B entries, + will require ~30 disk seeks. + + """ + + # record binary format: PID + a big endian 8-byte big endian integer + RECORD_BIN_FMT = '>' + PID_BIN_FMT + 'q' + RECORD_SIZE = PID_BIN_SIZE + INT_BIN_SIZE + + def __init__(self, fname): + """open an existing on-disk map + + Args: + fname (str): path to the on-disk map + + """ + super().__init__(self.RECORD_SIZE, fname) + + def _get_bin_record(self, pos): + """seek and return the (binary) record at a given (logical) position + + see :func:`_get_record` for an equivalent function with additional + deserialization + + Args: + pos: 0-based record number + + Returns: + tuple: a pair `(pid, int)`, where pid and int are bytes + + """ + rec_pos = pos * self.RECORD_SIZE + int_pos = rec_pos + PID_BIN_SIZE + + return (self.mm[rec_pos:int_pos], + self.mm[int_pos:int_pos+INT_BIN_SIZE]) + + def _get_record(self, pos): + """seek and return the record at a given (logical) position + + moral equivalent of :func:`_get_bin_record`, with additional + deserialization to non-bytes types + + Args: + pos: 0-based record number + + Returns: + tuple: a pair `(pid, int)`, where pid is a string-based PID and int + an integer + + """ + (pid_bytes, int_bytes) = self._get_bin_record(pos) + return (bytes_to_str(pid_bytes), + struct.unpack(INT_BIN_FMT, int_bytes)[0]) + + @classmethod + def write_record(cls, f, pid, int): + """write a logical record to a file-like object + + Args: + f: file-like object to write the record to + pid (str): textual PID + int (int): PID integer identifier + + """ + f.write(str_to_bytes(pid)) + f.write(struct.pack(INT_BIN_FMT, int)) + + def __getitem__(self, pid_str): + """lookup the integer identifier of a PID + + Args: + pid (str): the PID as a string + + Returns: + int: the integer identifier of pid + + """ + if not isinstance(pid_str, str): + raise TypeError('PID must be a str, not ' + type(pid_str)) + try: + target = str_to_bytes(pid_str) # desired PID as bytes + except ValueError: + raise ValueError('invalid PID: "{}"'.format(pid_str)) + + min = 0 + max = self.length - 1 + while (min <= max): + mid = (min + max) // 2 + (pid, int) = self._get_bin_record(mid) + if pid < target: + min = mid + 1 + elif pid > target: + max = mid - 1 + else: # pid == target + return struct.unpack(INT_BIN_FMT, int)[0] + + raise KeyError(pid_str) + + def __iter__(self): + for pos in range(self.length): + yield self._get_record(pos) + + def __len__(self): + return self.length + + +class IntToPidMap(_OnDiskMap, Sequence): + """memory mapped map from a continuous range of 0..N (8-byte long) integers to + PIDs (:ref:`persistent-identifiers`) + + This is the converse mapping of :class:`PidToIntMap`. + + The on-disk serialization format is a sequence of fixed length (22 bytes), + where each record is the binary representation of PID as per + :func:`str_to_bytes`. + + The records are sorted by long integer, so that integer lookup is possible + via fixed-offset seek. + + """ + + RECORD_BIN_FMT = PID_BIN_FMT + RECORD_SIZE = PID_BIN_SIZE + + def __init__(self, fname): + """open an existing on-disk map + + Args: + fname (str): path to the on-disk map + + """ + super().__init__(self.RECORD_SIZE, fname) + + def _get_bin_record(self, pos): + """seek and return the (binary) PID at a given (logical) position + + Args: + pos: 0-based record number + + Returns: + bytes: PID as a byte sequence + + """ + rec_pos = pos * self.RECORD_SIZE + + return self.mm[rec_pos:rec_pos+self.RECORD_SIZE] + + @classmethod + def write_record(cls, f, pid): + """write a PID to a file-like object + + Args: + f: file-like object to write the record to + pid (str): textual PID + + """ + f.write(str_to_bytes(pid)) + + def __getitem__(self, pos): + orig_pos = pos + if pos < 0: + pos = len(self) + pos + if not (0 <= pos < len(self)): + raise IndexError(orig_pos) + + return bytes_to_str(self._get_bin_record(pos)) + + def __len__(self): + return self.length diff --git a/swh/graph/tests/test_pid.py b/swh/graph/tests/test_pid.py new file mode 100644 --- /dev/null +++ b/swh/graph/tests/test_pid.py @@ -0,0 +1,159 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import shutil +import tempfile +import unittest + +from swh.graph.pid import str_to_bytes, bytes_to_str +from swh.graph.pid import PidToIntMap, IntToPidMap + + +class TestPidSerialization(unittest.TestCase): + + pairs = [ + ('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', + bytes.fromhex('01' + '01' + + '94a9ed024d3859793618152ea559a168bbcbb5e2')), + ('swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', + bytes.fromhex('01' + '02' + + 'd198bc9d7a6bcf6db04f476d29314f157507d505')), + ('swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f', + bytes.fromhex('01' + '03' + + 'b63a575fe3faab7692c9f38fb09d4bb45651bb0f')), + ('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', + bytes.fromhex('01' + '04' + + '22ece559cc7cc2364edc5e5593d63ae8bd229f9f')), + ('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', + bytes.fromhex('01' + '05' + + '309cf2674ee7a0749978cf8265ab91a60aea0f7d')), + ('swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', + bytes.fromhex('01' + '06' + + 'c7c108084bc0bf3d81436bf980b46e98bd338453')), + ] + + def test_str_to_bytes(self): + for (pid_str, pid_bytes) in self.pairs: + self.assertEqual(str_to_bytes(pid_str), pid_bytes) + + def test_bytes_to_str(self): + for (pid_str, pid_bytes) in self.pairs: + self.assertEqual(bytes_to_str(pid_bytes), pid_str) + + def test_round_trip(self): + for (pid_str, pid_bytes) in self.pairs: + self.assertEqual(pid_str, bytes_to_str(str_to_bytes(pid_str))) + self.assertEqual(pid_bytes, str_to_bytes(bytes_to_str(pid_bytes))) + + +def gen_records(types=['ori', 'snp', 'rev', 'rel', 'dir', 'cnt'], + length=10000): + """generate sequential PID/int records, suitable for filling int<->pid maps for + testing swh-graph on-disk binary databases + + Args: + types (list): list of PID types to be generated, specified as the + corresponding 3-letter component in PIDs + length (int): number of PIDs to generate *per type* + + Yields: + pairs (pid, int) where pid is a textual PID and int its sequential + integer identifier + + """ + pos = 0 + for t in sorted(types): + for i in range(0, length): + seq = format(pos, 'x') # current position as hex string + pid = 'swh:1:{}:{}{}'.format(t, '0' * (40 - len(seq)), seq) + yield (pid, pos) + pos += 1 + + +# pairs PID/position in the sequence generated by :func:`gen_records` above +MAP_PAIRS = [ + ('swh:1:cnt:0000000000000000000000000000000000000000', 0), + ('swh:1:cnt:000000000000000000000000000000000000002a', 42), + ('swh:1:dir:0000000000000000000000000000000000002afc', 11004), + ('swh:1:ori:00000000000000000000000000000000000056ce', 22222), + ('swh:1:rel:0000000000000000000000000000000000008235', 33333), + ('swh:1:rev:000000000000000000000000000000000000ad9c', 44444), + ('swh:1:snp:000000000000000000000000000000000000ea5f', 59999), +] + + +class TestPidToIntMap(unittest.TestCase): + + @classmethod + def setUpClass(cls): + """create reasonably sized (~2 MB) PID->int map to test on-disk DB + + """ + cls.tmpdir = tempfile.mkdtemp(prefix='swh.graph.test.') + cls.fname = os.path.join(cls.tmpdir, 'pid2int.bin') + with open(cls.fname, 'wb') as f: + for (pid, i) in gen_records(length=10000): + PidToIntMap.write_record(f, pid, i) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdir) + + def setUp(self): + self.map = PidToIntMap(self.fname) + + def tearDown(self): + self.map.close() + + def test_lookup(self): + for (pid, pos) in MAP_PAIRS: + self.assertEqual(self.map[pid], pos) + + def test_missing(self): + with self.assertRaises(KeyError): + self.map['swh:1:ori:0101010100000000000000000000000000000000'], + with self.assertRaises(KeyError): + self.map['swh:1:cnt:0101010100000000000000000000000000000000'], + + def test_type_error(self): + with self.assertRaises(TypeError): + self.map[42] + with self.assertRaises(TypeError): + self.map[1.2] + + +class TestIntToPidMap(unittest.TestCase): + + @classmethod + def setUpClass(cls): + """create reasonably sized (~1 MB) int->PID map to test on-disk DB + + """ + cls.tmpdir = tempfile.mkdtemp(prefix='swh.graph.test.') + cls.fname = os.path.join(cls.tmpdir, 'int2pid.bin') + with open(cls.fname, 'wb') as f: + for (pid, _i) in gen_records(length=10000): + IntToPidMap.write_record(f, pid) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdir) + + def setUp(self): + self.map = IntToPidMap(self.fname) + + def tearDown(self): + self.map.close() + + def test_lookup(self): + for (pid, pos) in MAP_PAIRS: + self.assertEqual(self.map[pos], pid) + + def test_out_of_bounds(self): + with self.assertRaises(IndexError): + self.map[1000000] + with self.assertRaises(IndexError): + self.map[-1000000]