diff --git a/swh/graph/cli.py b/swh/graph/cli.py --- a/swh/graph/cli.py +++ b/swh/graph/cli.py @@ -3,9 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging from pathlib import Path -import sys from typing import TYPE_CHECKING, Any, Dict, Set, Tuple # WARNING: do not import unnecessary things here to keep cli startup time under @@ -100,222 +98,6 @@ ctx.obj["config"] = conf -@graph_cli_group.command("api-client") -@click.option("--host", default="localhost", help="Graph server host") -@click.option("--port", default="5009", help="Graph server port") -@click.pass_context -def api_client(ctx, host, port): - """client for the graph RPC service""" - from swh.graph import client - - url = "http://{}:{}".format(host, port) - app = client.RemoteGraphClient(url) - - # TODO: run web app - print(app.stats()) - - -@graph_cli_group.group("map") -@click.pass_context -def map(ctx): - """Manage swh-graph on-disk maps""" - pass - - -def dump_swhid2node(filename): - from swh.graph.swhid import SwhidToNodeMap - - for (swhid, int) in SwhidToNodeMap(filename): - print("{}\t{}".format(swhid, int)) - - -def dump_node2swhid(filename): - from swh.graph.swhid import NodeToSwhidMap - - for (int, swhid) in NodeToSwhidMap(filename): - print("{}\t{}".format(int, swhid)) - - -def restore_swhid2node(filename): - """read a textual SWHID->int map from stdin and write its binary version to - filename - - """ - from swh.graph.swhid import SwhidToNodeMap - - with open(filename, "wb") as dst: - for line in sys.stdin: - (str_swhid, str_int) = line.split() - SwhidToNodeMap.write_record(dst, str_swhid, int(str_int)) - - -def restore_node2swhid(filename, length): - """read a textual int->SWHID map from stdin and write its binary version to - filename - - """ - from swh.graph.swhid import NodeToSwhidMap - - node2swhid = NodeToSwhidMap(filename, mode="wb", length=length) - for line in sys.stdin: - (str_int, str_swhid) = line.split() - node2swhid[int(str_int)] = str_swhid - node2swhid.close() - - -@map.command("dump") -@click.option( - "--type", - "-t", - "map_type", - required=True, - type=click.Choice(["swhid2node", "node2swhid"]), - help="type of map to dump", -) -@click.argument("filename", required=True, type=click.Path(exists=True)) -@click.pass_context -def dump_map(ctx, map_type, filename): - """Dump a binary SWHID<->node map to textual format.""" - if map_type == "swhid2node": - dump_swhid2node(filename) - elif map_type == "node2swhid": - dump_node2swhid(filename) - else: - raise ValueError("invalid map type: " + map_type) - pass - - -@map.command("restore") -@click.option( - "--type", - "-t", - "map_type", - required=True, - type=click.Choice(["swhid2node", "node2swhid"]), - help="type of map to dump", -) -@click.option( - "--length", - "-l", - type=int, - help="""map size in number of logical records - (required for node2swhid maps)""", -) -@click.argument("filename", required=True, type=click.Path()) -@click.pass_context -def restore_map(ctx, map_type, length, filename): - """Restore a binary SWHID<->node map from textual format.""" - if map_type == "swhid2node": - restore_swhid2node(filename) - elif map_type == "node2swhid": - if length is None: - raise click.UsageError( - "map length is required when restoring {} maps".format(map_type), ctx - ) - restore_node2swhid(filename, length) - else: - raise ValueError("invalid map type: " + map_type) - - -@map.command("write") -@click.option( - "--type", - "-t", - "map_type", - required=True, - type=click.Choice(["swhid2node", "node2swhid"]), - help="type of map to write", -) -@click.argument("filename", required=True, type=click.Path()) -@click.pass_context -def write(ctx, map_type, filename): - """Write a map to disk sequentially. - - read from stdin a textual SWHID->node mapping (for swhid2node, or a simple - sequence of SWHIDs for node2swhid) and write it to disk in the requested binary - map format - - note that no sorting is applied, so the input should already be sorted as - required by the chosen map type (by SWHID for swhid2node, by int for node2swhid) - - """ - from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap - - with open(filename, "wb") as f: - if map_type == "swhid2node": - for line in sys.stdin: - (swhid, int_str) = line.rstrip().split(maxsplit=1) - SwhidToNodeMap.write_record(f, swhid, int(int_str)) - elif map_type == "node2swhid": - for line in sys.stdin: - swhid = line.rstrip() - NodeToSwhidMap.write_record(f, swhid) - else: - raise ValueError("invalid map type: " + map_type) - - -@map.command("lookup") -@click.option( - "--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename" -) -@click.argument("identifiers", nargs=-1) -def map_lookup(graph, identifiers): - """Lookup identifiers using on-disk maps. - - Depending on the identifier type lookup either a SWHID into a SWHID->node (and - return the node integer identifier) or, vice-versa, lookup a node integer - identifier into a node->SWHID (and return the SWHID). The desired behavior is - chosen depending on the syntax of each given identifier. - - Identifiers can be passed either directly on the command line or on - standard input, separate by blanks. Logical lines (as returned by - readline()) in stdin will be preserved in stdout. - - """ - from swh.graph.backend import NODE2SWHID_EXT, SWHID2NODE_EXT - from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap - import swh.model.exceptions - from swh.model.swhids import ExtendedSWHID - - success = True # no identifiers failed to be looked up - swhid2node = SwhidToNodeMap(f"{graph}.{SWHID2NODE_EXT}") - node2swhid = NodeToSwhidMap(f"{graph}.{NODE2SWHID_EXT}") - - def lookup(identifier): - nonlocal success, swhid2node, node2swhid - is_swhid = None - try: - int(identifier) - is_swhid = False - except ValueError: - try: - ExtendedSWHID.from_string(identifier) - is_swhid = True - except swh.model.exceptions.ValidationError: - success = False - logging.error(f'invalid identifier: "{identifier}", skipping') - - try: - if is_swhid: - return str(swhid2node[identifier]) - else: - return node2swhid[int(identifier)] - except KeyError: - success = False - logging.error(f'identifier not found: "{identifier}", skipping') - - if identifiers: # lookup identifiers passed via CLI - for identifier in identifiers: - print(lookup(identifier)) - else: # lookup identifiers passed via stdin, preserving logical lines - for line in sys.stdin: - results = [lookup(id) for id in line.rstrip().split()] - if results: # might be empty if all IDs on the same line failed - print(" ".join(results)) - - sys.exit(0 if success else 1) - - @graph_cli_group.command(name="rpc-serve") @click.option( "--host", @@ -407,43 +189,6 @@ webgraph.compress(graph_name, input_dataset, output_directory, steps, conf) -@graph_cli_group.command(name="cachemount") -@click.option( - "--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename" -) -@click.option( - "--cache", - "-c", - default="/dev/shm/swh-graph/default", - metavar="CACHE", - type=PathlibPath(), - help="Memory cache path (defaults to /dev/shm/swh-graph/default)", -) -@click.pass_context -def cachemount(ctx, graph, cache): - """ - Cache the mmapped files of the compressed graph in a tmpfs. - - This command creates a new directory at the path given by CACHE that has - the same structure as the compressed graph basename, except it copies the - files that require mmap access (:file:`{*}.graph`) but uses symlinks from the source - for all the other files (:file:`{*}.map`, :file:`{*}.bin`, ...). - - The command outputs the path to the memory cache directory (particularly - useful when relying on the default value). - """ - import shutil - - cache.mkdir(parents=True) - for src in Path(graph).parent.glob("*"): - dst = cache / src.name - if src.suffix == ".graph": - shutil.copy2(src, dst) - else: - dst.symlink_to(src.resolve()) - print(cache) - - def main(): return graph_cli_group(auto_envvar_prefix="SWH_GRAPH") diff --git a/swh/graph/swhid.py b/swh/graph/swhid.py deleted file mode 100644 --- a/swh/graph/swhid.py +++ /dev/null @@ -1,419 +0,0 @@ -# Copyright (C) 2019-2021 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from __future__ import annotations - -from collections.abc import MutableMapping -from enum import Enum -import mmap -from mmap import MAP_SHARED, PROT_READ, PROT_WRITE -import os -import struct -from typing import BinaryIO, Iterator, Tuple - -from swh.model.hashutil import hash_to_hex -from swh.model.swhids import ExtendedObjectType, ExtendedSWHID - -SWHID_BIN_FMT = "BB20s" # 2 unsigned chars + 20 bytes -INT_BIN_FMT = ">q" # big endian, 8-byte integer -SWHID_BIN_SIZE = 22 # in bytes -INT_BIN_SIZE = 8 # in bytes - - -class SwhidType(Enum): - """types of existing SWHIDs, used to serialize ExtendedSWHID type as a (char) - integer - - Note that the order does matter also for driving the binary search in - SWHID-indexed maps. Integer values also matter, for compatibility with the - Java layer. - - """ - - content = 0 - directory = 1 - origin = 2 - release = 3 - revision = 4 - snapshot = 5 - - @classmethod - def from_extended_object_type(cls, object_type: ExtendedObjectType) -> SwhidType: - return cls[object_type.name.lower()] - - def to_extended_object_type(self) -> ExtendedObjectType: - return ExtendedObjectType[SwhidType(self).name.upper()] - - -def str_to_bytes(swhid_str: str) -> bytes: - """Convert a SWHID to a byte sequence - - The binary format used to represent SWHIDs as 22-byte long byte sequences as - follows: - - - 1 byte for the namespace version represented as a C `unsigned char` - - 1 byte for the object type, as the int value of :class:`SwhidType` enums, - represented as a C `unsigned char` - - 20 bytes for the SHA1 digest as a byte sequence - - Args: - swhid: persistent identifier - - Returns: - bytes: byte sequence representation of swhid - - """ - swhid = ExtendedSWHID.from_string(swhid_str) - return struct.pack( - SWHID_BIN_FMT, - swhid.scheme_version, - SwhidType.from_extended_object_type(swhid.object_type).value, - swhid.object_id, - ) - - -def bytes_to_str(bytes: bytes) -> str: - """Inverse function of :func:`str_to_bytes` - - See :func:`str_to_bytes` for a description of the binary SWHID format. - - Args: - bytes: byte sequence representation of swhid - - Returns: - swhid: persistent identifier - - """ - (version, type, bin_digest) = struct.unpack(SWHID_BIN_FMT, bytes) - - # The following is equivalent to: - # return str(ExtendedSWHID( - # object_type=SwhidType(type).to_extended_object_type(), object_id=bin_digest - # ) - # but more efficient, because ExtendedSWHID.__init__ is extremely slow. - object_type = ExtendedObjectType[SwhidType(type).name.upper()] - return f"swh:1:{object_type.value}:{hash_to_hex(bin_digest)}" - - -class _OnDiskMap: - """mmap-ed on-disk sequence of fixed size records""" - - def __init__( - self, record_size: int, fname: str, mode: str = "rb", length: int = None - ): - """open an existing on-disk map - - Args: - record_size: size of each record in bytes - fname: path to the on-disk map - mode: file open mode, usually either 'rb' for read-only maps, 'wb' - for creating new maps, or 'rb+' for updating existing ones - (default: 'rb') - length: map size in number of logical records; used to initialize - writable maps at creation time. Must be given when mode is 'wb' - and the map doesn't exist on disk; ignored otherwise - - """ - os_modes = {"rb": os.O_RDONLY, "wb": os.O_RDWR | os.O_CREAT, "rb+": os.O_RDWR} - if mode not in os_modes: - raise ValueError("invalid file open mode: " + mode) - new_map = mode == "wb" - writable_map = mode in ["wb", "rb+"] - - self.record_size = record_size - self.fd = os.open(fname, os_modes[mode]) - if new_map: - if length is None: - raise ValueError("missing length when creating new map") - os.truncate(self.fd, length * self.record_size) - - self.size = os.path.getsize(fname) - (self.length, remainder) = divmod(self.size, record_size) - if remainder: - raise ValueError( - "map size {} is not a multiple of the record size {}".format( - self.size, record_size - ) - ) - - self.mm = mmap.mmap( - self.fd, - self.size, - prot=(PROT_READ | PROT_WRITE if writable_map else PROT_READ), - flags=MAP_SHARED, - ) - - def close(self) -> None: - """close the map - - shuts down both the mmap and the underlying file descriptor - - """ - if not self.mm.closed: - self.mm.close() - os.close(self.fd) - - def __len__(self) -> int: - return self.length - - def __delitem__(self, pos: int) -> None: - raise NotImplementedError("cannot delete records from fixed-size map") - - -class SwhidToNodeMap(_OnDiskMap, MutableMapping): - """memory mapped map from :ref:`SWHIDs ` to a - continuous range 0..N of (8-byte long) integers - - This is the converse mapping of :class:`NodeToSwhidMap`. - - The on-disk serialization format is a sequence of fixed length (30 bytes) - records with the following fields: - - - SWHID (22 bytes): binary SWHID representation as per :func:`str_to_bytes` - - long (8 bytes): big endian long integer - - The records are sorted lexicographically by SWHID type and checksum, where - type is the integer value of :class:`SwhidType`. SWHID lookup in the map is - performed via binary search. Hence a huge map with, say, 11 B entries, - will require ~30 disk seeks. - - Note that, due to fixed size + ordering, it is not possible to create these - maps by random writing. Hence, __setitem__ can be used only to *update* the - value associated to an existing key, rather than to add a missing item. To - create an entire map from scratch, you should do so *sequentially*, using - static method :meth:`write_record` (or, at your own risk, by hand via the - mmap :attr:`mm`). - - """ - - # record binary format: SWHID + a big endian 8-byte big endian integer - RECORD_BIN_FMT = ">" + SWHID_BIN_FMT + "q" - RECORD_SIZE = SWHID_BIN_SIZE + INT_BIN_SIZE - - def __init__(self, fname: str, mode: str = "rb", length: int = None): - """open an existing on-disk map - - Args: - fname: path to the on-disk map - mode: file open mode, usually either 'rb' for read-only maps, 'wb' - for creating new maps, or 'rb+' for updating existing ones - (default: 'rb') - length: map size in number of logical records; used to initialize - read-write maps at creation time. Must be given when mode is - 'wb'; ignored otherwise - - """ - super().__init__(self.RECORD_SIZE, fname, mode=mode, length=length) - - def _get_bin_record(self, pos: int) -> Tuple[bytes, bytes]: - """seek and return the (binary) record at a given (logical) position - - see :func:`_get_record` for an equivalent function with additional - deserialization - - Args: - pos: 0-based record number - - Returns: - a pair `(swhid, int)`, where swhid and int are bytes - - """ - rec_pos = pos * self.RECORD_SIZE - int_pos = rec_pos + SWHID_BIN_SIZE - - return (self.mm[rec_pos:int_pos], self.mm[int_pos : int_pos + INT_BIN_SIZE]) - - def _get_record(self, pos: int) -> Tuple[str, int]: - """seek and return the record at a given (logical) position - - moral equivalent of :func:`_get_bin_record`, with additional - deserialization to non-bytes types - - Args: - pos: 0-based record number - - Returns: - a pair `(swhid, int)`, where swhid is a string-based SWHID and int the - corresponding integer identifier - - """ - (swhid_bytes, int_bytes) = self._get_bin_record(pos) - return (bytes_to_str(swhid_bytes), struct.unpack(INT_BIN_FMT, int_bytes)[0]) - - @classmethod - def write_record(cls, f: BinaryIO, swhid: str, int: int) -> None: - """write a logical record to a file-like object - - Args: - f: file-like object to write the record to - swhid: textual SWHID - int: SWHID integer identifier - - """ - f.write(str_to_bytes(swhid)) - f.write(struct.pack(INT_BIN_FMT, int)) - - def _bisect_pos(self, swhid_str: str) -> int: - """bisect the position of the given identifier. If the identifier is - not found, the position of the swhid immediately after is returned. - - Args: - swhid_str: the swhid as a string - - Returns: - the logical record of the bisected position in the map - - """ - if not isinstance(swhid_str, str): - raise TypeError("SWHID must be a str, not {}".format(type(swhid_str))) - try: - target = str_to_bytes(swhid_str) # desired SWHID as bytes - except ValueError: - raise ValueError('invalid SWHID: "{}"'.format(swhid_str)) - - lo = 0 - hi = self.length - 1 - while lo < hi: - mid = (lo + hi) // 2 - (swhid, _value) = self._get_bin_record(mid) - if swhid < target: - lo = mid + 1 - else: - hi = mid - return lo - - def _find(self, swhid_str: str) -> Tuple[int, int]: - """lookup the integer identifier of a swhid and its position - - Args: - swhid_str: the swhid as a string - - Returns: - a pair `(swhid, pos)` with swhid integer identifier and its logical - record position in the map - - """ - pos = self._bisect_pos(swhid_str) - swhid_found, value = self._get_record(pos) - if swhid_found == swhid_str: - return (value, pos) - raise KeyError(swhid_str) - - def __getitem__(self, swhid_str: str) -> int: - """lookup the integer identifier of a SWHID - - Args: - swhid: the SWHID as a string - - Returns: - the integer identifier of swhid - - """ - return self._find(swhid_str)[0] # return element, ignore position - - def __setitem__(self, swhid_str: str, int: str) -> None: - (_swhid, pos) = self._find(swhid_str) # might raise KeyError and that's OK - - rec_pos = pos * self.RECORD_SIZE - int_pos = rec_pos + SWHID_BIN_SIZE - self.mm[rec_pos:int_pos] = str_to_bytes(swhid_str) - self.mm[int_pos : int_pos + INT_BIN_SIZE] = struct.pack(INT_BIN_FMT, int) - - def __iter__(self) -> Iterator[Tuple[str, int]]: - for pos in range(self.length): - yield self._get_record(pos) - - def iter_prefix(self, prefix: str): - swh, n, t, sha = prefix.split(":") - sha = sha.ljust(40, "0") - start_swhid = ":".join([swh, n, t, sha]) - start = self._bisect_pos(start_swhid) - for pos in range(start, self.length): - swhid, value = self._get_record(pos) - if not swhid.startswith(prefix): - break - yield swhid, value - - def iter_type(self, swhid_type: str) -> Iterator[Tuple[str, int]]: - prefix = "swh:1:{}:".format(swhid_type) - yield from self.iter_prefix(prefix) - - -class NodeToSwhidMap(_OnDiskMap, MutableMapping): - """memory mapped map from a continuous range of 0..N (8-byte long) integers to - :ref:`SWHIDs ` - - This is the converse mapping of :class:`SwhidToNodeMap`. - - The on-disk serialization format is a sequence of fixed length records (22 - bytes), each being the binary representation of a SWHID as per - :func:`str_to_bytes`. - - The records are sorted by long integer, so that integer lookup is possible - via fixed-offset seek. - - """ - - RECORD_BIN_FMT = SWHID_BIN_FMT - RECORD_SIZE = SWHID_BIN_SIZE - - def __init__(self, fname: str, mode: str = "rb", length: int = None): - """open an existing on-disk map - - Args: - fname: path to the on-disk map - mode: file open mode, usually either 'rb' for read-only maps, 'wb' - for creating new maps, or 'rb+' for updating existing ones - (default: 'rb') - size: map size in number of logical records; used to initialize - read-write maps at creation time. Must be given when mode is - 'wb'; ignored otherwise - length: passed to :class:`_OnDiskMap` - - """ - - super().__init__(self.RECORD_SIZE, fname, mode=mode, length=length) - - def _get_bin_record(self, pos: int) -> bytes: - """seek and return the (binary) SWHID at a given (logical) position - - Args: - pos: 0-based record number - - Returns: - SWHID as a byte sequence - - """ - rec_pos = pos * self.RECORD_SIZE - - return self.mm[rec_pos : rec_pos + self.RECORD_SIZE] - - @classmethod - def write_record(cls, f: BinaryIO, swhid: str) -> None: - """write a SWHID to a file-like object - - Args: - f: file-like object to write the record to - swhid: textual SWHID - - """ - f.write(str_to_bytes(swhid)) - - def __getitem__(self, pos: int) -> str: - orig_pos = pos - if pos < 0: - pos = len(self) + pos - if not (0 <= pos < len(self)): - raise IndexError(orig_pos) - - return bytes_to_str(self._get_bin_record(pos)) - - def __setitem__(self, pos: int, swhid: str) -> None: - rec_pos = pos * self.RECORD_SIZE - self.mm[rec_pos : rec_pos + self.RECORD_SIZE] = str_to_bytes(swhid) - - def __iter__(self) -> Iterator[Tuple[int, str]]: - for pos in range(self.length): - yield (pos, self[pos]) diff --git a/swh/graph/tests/test_swhid.py b/swh/graph/tests/test_swhid.py deleted file mode 100644 --- a/swh/graph/tests/test_swhid.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from itertools import islice -import os -import shutil -import tempfile -import unittest - -from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap, bytes_to_str, str_to_bytes -from swh.model.swhids import SWHID_TYPES - - -class TestSwhidSerialization(unittest.TestCase): - - pairs = [ - ( - "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", - bytes.fromhex("01" + "00" + "94a9ed024d3859793618152ea559a168bbcbb5e2"), - ), - ( - "swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505", - bytes.fromhex("01" + "01" + "d198bc9d7a6bcf6db04f476d29314f157507d505"), - ), - ( - "swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f", - bytes.fromhex("01" + "02" + "b63a575fe3faab7692c9f38fb09d4bb45651bb0f"), - ), - ( - "swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f", - bytes.fromhex("01" + "03" + "22ece559cc7cc2364edc5e5593d63ae8bd229f9f"), - ), - ( - "swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d", - bytes.fromhex("01" + "04" + "309cf2674ee7a0749978cf8265ab91a60aea0f7d"), - ), - ( - "swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453", - bytes.fromhex("01" + "05" + "c7c108084bc0bf3d81436bf980b46e98bd338453"), - ), - ] - - def test_str_to_bytes(self): - for (swhid_str, swhid_bytes) in self.pairs: - self.assertEqual(str_to_bytes(swhid_str), swhid_bytes) - - def test_bytes_to_str(self): - for (swhid_str, swhid_bytes) in self.pairs: - self.assertEqual(bytes_to_str(swhid_bytes), swhid_str) - - def test_round_trip(self): - for (swhid_str, swhid_bytes) in self.pairs: - self.assertEqual(swhid_str, bytes_to_str(str_to_bytes(swhid_str))) - self.assertEqual(swhid_bytes, str_to_bytes(bytes_to_str(swhid_bytes))) - - -def gen_records(types=["cnt", "dir", "ori", "rel", "rev", "snp"], length=10000): - """generate sequential SWHID/int records, suitable for filling int<->swhid maps for - testing swh-graph on-disk binary databases - - Args: - types (list): list of SWHID types to be generated, specified as the - corresponding 3-letter component in SWHIDs - length (int): number of SWHIDs to generate *per type* - - Yields: - pairs (swhid, int) where swhid is a textual SWHID and int its sequential - integer identifier - - """ - pos = 0 - for t in sorted(types): - for i in range(0, length): - seq = format(pos, "x") # current position as hex string - swhid = "swh:1:{}:{}{}".format(t, "0" * (40 - len(seq)), seq) - yield (swhid, pos) - pos += 1 - - -# pairs SWHID/position in the sequence generated by :func:`gen_records` above -MAP_PAIRS = [ - ("swh:1:cnt:0000000000000000000000000000000000000000", 0), - ("swh:1:cnt:000000000000000000000000000000000000002a", 42), - ("swh:1:dir:0000000000000000000000000000000000002afc", 11004), - ("swh:1:ori:00000000000000000000000000000000000056ce", 22222), - ("swh:1:rel:0000000000000000000000000000000000008235", 33333), - ("swh:1:rev:000000000000000000000000000000000000ad9c", 44444), - ("swh:1:snp:000000000000000000000000000000000000ea5f", 59999), -] - - -class TestSwhidToNodeMap(unittest.TestCase): - @classmethod - def setUpClass(cls): - """create reasonably sized (~2 MB) SWHID->int map to test on-disk DB""" - cls.tmpdir = tempfile.mkdtemp(prefix="swh.graph.test.") - cls.fname = os.path.join(cls.tmpdir, "swhid2int.bin") - with open(cls.fname, "wb") as f: - for (swhid, i) in gen_records(length=10000): - SwhidToNodeMap.write_record(f, swhid, i) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdir) - - def setUp(self): - self.map = SwhidToNodeMap(self.fname) - - def tearDown(self): - self.map.close() - - def test_lookup(self): - for (swhid, pos) in MAP_PAIRS: - self.assertEqual(self.map[swhid], pos) - - def test_missing(self): - with self.assertRaises(KeyError): - self.map["swh:1:ori:0101010100000000000000000000000000000000"], - with self.assertRaises(KeyError): - self.map["swh:1:cnt:0101010100000000000000000000000000000000"], - - def test_type_error(self): - with self.assertRaises(TypeError): - self.map[42] - with self.assertRaises(TypeError): - self.map[1.2] - - def test_update(self): - fname2 = self.fname + ".update" - shutil.copy(self.fname, fname2) # fresh map copy - map2 = SwhidToNodeMap(fname2, mode="rb+") - for (swhid, int) in islice(map2, 11): # update the first N items - new_int = int + 42 - map2[swhid] = new_int - self.assertEqual(map2[swhid], new_int) # check updated value - - os.unlink(fname2) # tmpdir will be cleaned even if we don't reach this - - def test_iter_type(self): - for t in SWHID_TYPES + ["ori"]: - first_20 = list(islice(self.map.iter_type(t), 20)) - k = first_20[0][1] - expected = [("swh:1:{}:{:040x}".format(t, i), i) for i in range(k, k + 20)] - assert first_20 == expected - - def test_iter_prefix(self): - for t in SWHID_TYPES + ["ori"]: - prefix = self.map.iter_prefix("swh:1:{}:00".format(t)) - first_20 = list(islice(prefix, 20)) - k = first_20[0][1] - expected = [("swh:1:{}:{:040x}".format(t, i), i) for i in range(k, k + 20)] - assert first_20 == expected - - -class TestNodeToSwhidMap(unittest.TestCase): - @classmethod - def setUpClass(cls): - """create reasonably sized (~1 MB) int->SWHID map to test on-disk DB""" - cls.tmpdir = tempfile.mkdtemp(prefix="swh.graph.test.") - cls.fname = os.path.join(cls.tmpdir, "int2swhid.bin") - with open(cls.fname, "wb") as f: - for (swhid, _i) in gen_records(length=10000): - NodeToSwhidMap.write_record(f, swhid) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdir) - - def setUp(self): - self.map = NodeToSwhidMap(self.fname) - - def tearDown(self): - self.map.close() - - def test_lookup(self): - for (swhid, pos) in MAP_PAIRS: - self.assertEqual(self.map[pos], swhid) - - def test_out_of_bounds(self): - with self.assertRaises(IndexError): - self.map[1000000] - with self.assertRaises(IndexError): - self.map[-1000000] - - def test_update(self): - fname2 = self.fname + ".update" - shutil.copy(self.fname, fname2) # fresh map copy - map2 = NodeToSwhidMap(fname2, mode="rb+") - for (int, swhid) in islice(map2, 11): # update the first N items - new_swhid = swhid.replace(":0", ":f") # mangle first hex digit - map2[int] = new_swhid - self.assertEqual(map2[int], new_swhid) # check updated value - - os.unlink(fname2) # tmpdir will be cleaned even if we don't reach this