diff --git a/swh/storage/backfill.py b/swh/storage/backfill.py index 9c0225bf..d9f92e7c 100644 --- a/swh/storage/backfill.py +++ b/swh/storage/backfill.py @@ -1,521 +1,521 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Storage backfiller. The backfiller goal is to produce back part or all of the objects from a storage to the journal topics Current implementation consists in the JournalBackfiller class. It simply reads the objects from the storage and sends every object identifier back to the journal. """ import logging from swh.core.db import BaseDb from swh.journal.writer.kafka import KafkaJournalWriter from swh.storage.converters import ( db_to_raw_extrinsic_metadata, db_to_release, db_to_revision, ) from swh.storage.replay import object_converter_fn logger = logging.getLogger(__name__) PARTITION_KEY = { "content": "sha1", "skipped_content": "sha1", "directory": "id", "metadata_authority": "type, url", "metadata_fetcher": "name, version", "raw_extrinsic_metadata": "id", "revision": "revision.id", "release": "release.id", "snapshot": "id", "origin": "id", "origin_visit": "origin_visit.origin", "origin_visit_status": "origin_visit_status.origin", } COLUMNS = { "content": [ "sha1", "sha1_git", "sha256", "blake2s256", "length", "status", "ctime", ], "skipped_content": [ "sha1", "sha1_git", "sha256", "blake2s256", "length", "ctime", "status", "reason", ], "directory": ["id", "dir_entries", "file_entries", "rev_entries"], "metadata_authority": ["type", "url", "metadata",], "metadata_fetcher": ["name", "version", "metadata",], "raw_extrinsic_metadata": [ "raw_extrinsic_metadata.type", "raw_extrinsic_metadata.id", "metadata_authority.type", "metadata_authority.url", "metadata_fetcher.name", "metadata_fetcher.version", "discovery_date", "format", 
"raw_extrinsic_metadata.metadata", "origin", "visit", "snapshot", "release", "revision", "path", "directory", ], "revision": [ ("revision.id", "id"), "date", "date_offset", "date_neg_utc_offset", "committer_date", "committer_date_offset", "committer_date_neg_utc_offset", "type", "directory", "message", "synthetic", "metadata", ( "array(select parent_id::bytea from revision_history rh " "where rh.id = revision.id order by rh.parent_rank asc)", "parents", ), ("a.id", "author_id"), ("a.name", "author_name"), ("a.email", "author_email"), ("a.fullname", "author_fullname"), ("c.id", "committer_id"), ("c.name", "committer_name"), ("c.email", "committer_email"), ("c.fullname", "committer_fullname"), ], "release": [ ("release.id", "id"), "date", "date_offset", "date_neg_utc_offset", "comment", ("release.name", "name"), "synthetic", "target", "target_type", ("a.id", "author_id"), ("a.name", "author_name"), ("a.email", "author_email"), ("a.fullname", "author_fullname"), ], "snapshot": ["id", "object_id"], "origin": ["url"], "origin_visit": ["visit", "type", ("origin.url", "origin"), "date",], "origin_visit_status": [ "visit", ("origin.url", "origin"), "date", "snapshot", "status", "metadata", ], } JOINS = { "release": ["person a on release.author=a.id"], "revision": [ "person a on revision.author=a.id", "person c on revision.committer=c.id", ], "origin_visit": ["origin on origin_visit.origin=origin.id"], "origin_visit_status": ["origin on origin_visit_status.origin=origin.id"], "raw_extrinsic_metadata": [ "metadata_authority on " "raw_extrinsic_metadata.authority_id=metadata_authority.id", "metadata_fetcher on raw_extrinsic_metadata.fetcher_id=metadata_fetcher.id", ], } def directory_converter(db, directory): """Convert directory from the flat representation to swh model compatible objects. 
""" columns = ["target", "name", "perms"] query_template = """ select %(columns)s from directory_entry_%(type)s where id in %%s """ types = ["file", "dir", "rev"] entries = [] with db.cursor() as cur: for type in types: ids = directory.pop("%s_entries" % type) if not ids: continue query = query_template % { "columns": ",".join(columns), "type": type, } cur.execute(query, (tuple(ids),)) for row in cur: entry = dict(zip(columns, row)) entry["type"] = type entries.append(entry) directory["entries"] = entries return directory def raw_extrinsic_metadata_converter(db, metadata): """Convert revision from the flat representation to swh model compatible objects. """ return db_to_raw_extrinsic_metadata(metadata).to_dict() def revision_converter(db, revision): """Convert revision from the flat representation to swh model compatible objects. """ - return db_to_revision(revision) + return db_to_revision(revision).to_dict() def release_converter(db, release): """Convert release from the flat representation to swh model compatible objects. """ - return db_to_release(release) + return db_to_release(release).to_dict() def snapshot_converter(db, snapshot): """Convert snapshot from the flat representation to swh model compatible objects. 
""" columns = ["name", "target", "target_type"] query = """ select %s from snapshot_branches sbs inner join snapshot_branch sb on sb.object_id=sbs.branch_id where sbs.snapshot_id=%%s """ % ", ".join( columns ) with db.cursor() as cur: cur.execute(query, (snapshot.pop("object_id"),)) branches = {} for name, *row in cur: branch = dict(zip(columns[1:], row)) if not branch["target"] and not branch["target_type"]: branch = None branches[name] = branch snapshot["branches"] = branches return snapshot CONVERTERS = { "directory": directory_converter, "raw_extrinsic_metadata": raw_extrinsic_metadata_converter, "revision": revision_converter, "release": release_converter, "snapshot": snapshot_converter, } def object_to_offset(object_id, numbits): """Compute the index of the range containing object id, when dividing space into 2^numbits. Args: object_id (str): The hex representation of object_id numbits (int): Number of bits in which we divide input space Returns: The index of the range containing object id """ q, r = divmod(numbits, 8) length = q + (r != 0) shift_bits = 8 - r if r else 0 truncated_id = object_id[: length * 2] if len(truncated_id) < length * 2: truncated_id += "0" * (length * 2 - len(truncated_id)) truncated_id_bytes = bytes.fromhex(truncated_id) return int.from_bytes(truncated_id_bytes, byteorder="big") >> shift_bits def byte_ranges(numbits, start_object=None, end_object=None): """Generate start/end pairs of bytes spanning numbits bits and constrained by optional start_object and end_object. 
Args: numbits (int): Number of bits in which we divide input space start_object (str): Hex object id contained in the first range returned end_object (str): Hex object id contained in the last range returned Yields: 2^numbits pairs of bytes """ q, r = divmod(numbits, 8) length = q + (r != 0) shift_bits = 8 - r if r else 0 def to_bytes(i): return int.to_bytes(i << shift_bits, length=length, byteorder="big") start_offset = 0 end_offset = 1 << numbits if start_object is not None: start_offset = object_to_offset(start_object, numbits) if end_object is not None: end_offset = object_to_offset(end_object, numbits) + 1 for start in range(start_offset, end_offset): end = start + 1 if start == 0: yield None, to_bytes(end) elif end == 1 << numbits: yield to_bytes(start), None else: yield to_bytes(start), to_bytes(end) def integer_ranges(start, end, block_size=1000): for start in range(start, end, block_size): if start == 0: yield None, block_size elif start + block_size > end: yield start, end else: yield start, start + block_size RANGE_GENERATORS = { "content": lambda start, end: byte_ranges(24, start, end), "skipped_content": lambda start, end: [(None, None)], "directory": lambda start, end: byte_ranges(24, start, end), "revision": lambda start, end: byte_ranges(24, start, end), "release": lambda start, end: byte_ranges(16, start, end), "snapshot": lambda start, end: byte_ranges(16, start, end), "origin": integer_ranges, "origin_visit": integer_ranges, "origin_visit_status": integer_ranges, } def compute_query(obj_type, start, end): columns = COLUMNS.get(obj_type) join_specs = JOINS.get(obj_type, []) join_clause = "\n".join("left join %s" % clause for clause in join_specs) where = [] where_args = [] if start: where.append("%(keys)s >= %%s") where_args.append(start) if end: where.append("%(keys)s < %%s") where_args.append(end) where_clause = "" if where: where_clause = ("where " + " and ".join(where)) % { "keys": "(%s)" % PARTITION_KEY[obj_type] } column_specs = [] 
column_aliases = [] for column in columns: if isinstance(column, str): column_specs.append(column) column_aliases.append(column) else: column_specs.append("%s as %s" % column) column_aliases.append(column[1]) query = """ select %(columns)s from %(table)s %(join)s %(where)s """ % { "columns": ",".join(column_specs), "table": obj_type, "join": join_clause, "where": where_clause, } return query, where_args, column_aliases def fetch(db, obj_type, start, end): """Fetch all obj_type's objects from db. This opens one cursor on the given db, streams the objects and, when done, closes the cursor. Args: db (BaseDb): Db connection object obj_type (str): Object type start (Union[bytes|Tuple]): Range start identifier end (Union[bytes|Tuple]): Range end identifier Raises: ValueError if obj_type is not supported Yields: Objects in the given range """ query, where_args, column_aliases = compute_query(obj_type, start, end) converter = CONVERTERS.get(obj_type) with db.cursor() as cursor: logger.debug("Fetching data for table %s", obj_type) logger.debug("query: %s %s", query, where_args) cursor.execute(query, where_args) for row in cursor: record = dict(zip(column_aliases, row)) if converter: record = converter(db, record) logger.debug("record: %s" % record) yield record def _format_range_bound(bound): if isinstance(bound, bytes): return bound.hex() else: return str(bound) MANDATORY_KEYS = ["brokers", "storage_dbconn", "prefix", "client_id"] class JournalBackfiller: """Class in charge of reading the storage's objects and sends those back to the journal's topics. This is designed to be run periodically. 
""" def __init__(self, config=None): self.config = config self.check_config(config) def check_config(self, config): missing_keys = [] for key in MANDATORY_KEYS: if not config.get(key): missing_keys.append(key) if missing_keys: raise ValueError( "Configuration error: The following keys must be" " provided: %s" % (",".join(missing_keys),) ) def parse_arguments(self, object_type, start_object, end_object): """Parse arguments Raises: ValueError for unsupported object type ValueError if object ids are not parseable Returns: Parsed start and end object ids """ if object_type not in COLUMNS: raise ValueError( "Object type %s is not supported. " "The only possible values are %s" % (object_type, ", ".join(COLUMNS.keys())) ) if object_type in ["origin", "origin_visit"]: if start_object: start_object = int(start_object) else: start_object = 0 if end_object: end_object = int(end_object) else: end_object = 100 * 1000 * 1000 # hard-coded limit return start_object, end_object def run(self, object_type, start_object, end_object, dry_run=False): """Reads storage's subscribed object types and send them to the journal's reading topic. 
""" start_object, end_object = self.parse_arguments( object_type, start_object, end_object ) db = BaseDb.connect(self.config["storage_dbconn"]) writer = KafkaJournalWriter( brokers=self.config["brokers"], prefix=self.config["prefix"], client_id=self.config["client_id"], ) for range_start, range_end in RANGE_GENERATORS[object_type]( start_object, end_object ): logger.info( "Processing %s range %s to %s", object_type, _format_range_bound(range_start), _format_range_bound(range_end), ) for obj_d in fetch(db, object_type, start=range_start, end=range_end,): obj = object_converter_fn[object_type](obj_d) if dry_run: continue writer.write_addition(object_type=object_type, object_=obj) writer.producer.flush() if __name__ == "__main__": print('Please use the "swh-journal backfiller run" command') diff --git a/swh/storage/converters.py b/swh/storage/converters.py index 7773b2cd..8c3cbe79 100644 --- a/swh/storage/converters.py +++ b/swh/storage/converters.py @@ -1,331 +1,322 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from typing import Any, Optional, Dict from swh.core.utils import encode_with_unescape -from swh.model import identifiers from swh.model.identifiers import parse_swhid from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, MetadataTargetType, + ObjectType, + Person, RawExtrinsicMetadata, + Release, + Revision, + RevisionType, + Timestamp, + TimestampWithTimezone, ) from swh.model.hashutil import MultiHash from .utils import map_optional DEFAULT_AUTHOR = { "fullname": None, "name": None, "email": None, } DEFAULT_DATE = { "timestamp": None, "offset": 0, "neg_utc_offset": None, } -def author_to_db(author): +def author_to_db(author: Optional[Person]) -> Dict[str, Any]: """Convert a swh-model author to its DB 
representation. Args: author: a :mod:`swh.model` compatible author Returns: dict: a dictionary with three keys: fullname, name and email """ if author is None: return DEFAULT_AUTHOR - return author + return author.to_dict() def db_to_author( fullname: Optional[bytes], name: Optional[bytes], email: Optional[bytes] -) -> Optional[Dict[str, Optional[bytes]]]: +) -> Optional[Person]: """Convert the DB representation of an author to a swh-model author. Args: fullname (bytes): the author's fullname name (bytes): the author's name email (bytes): the author's email Returns: - a dictionary with three keys (fullname, name and email), or - None if all the arguments are None. + a Person object, or None if 'fullname' is None. """ - if (fullname, name, email) == (None, None, None): + if fullname is None: return None - return { - "fullname": fullname, - "name": name, - "email": email, - } + return Person(fullname=fullname, name=name, email=email,) def db_to_git_headers(db_git_headers): ret = [] for key, value in db_git_headers: ret.append([key.encode("utf-8"), encode_with_unescape(value)]) return ret -def db_to_date(date, offset, neg_utc_offset): +def db_to_date( + date: Optional[datetime.datetime], offset: int, neg_utc_offset: bool +) -> Optional[TimestampWithTimezone]: """Convert the DB representation of a date to a swh-model compatible date. Args: - date (datetime.datetime): a date pulled out of the database - offset (int): an integer number of minutes representing an UTC offset - neg_utc_offset (boolean): whether an utc offset is negative + date: a date pulled out of the database + offset: an integer number of minutes representing an UTC offset + neg_utc_offset: whether an utc offset is negative Returns: - dict: a dict with three keys: - - - timestamp: a timestamp from UTC - - offset: the number of minutes since UTC - - negative_utc: whether a null UTC offset is negative + a TimestampWithTimezone, or None if the date is None. 
""" if date is None: return None - return { - "timestamp": { - "seconds": int(date.timestamp()), - "microseconds": date.microsecond, - }, - "offset": offset, - "negative_utc": neg_utc_offset, - } + return TimestampWithTimezone( + timestamp=Timestamp( + seconds=int(date.timestamp()), microseconds=date.microsecond, + ), + offset=offset, + negative_utc=neg_utc_offset, + ) -def date_to_db(date_offset): +def date_to_db(ts_with_tz: Optional[TimestampWithTimezone]) -> Dict[str, Any]: """Convert a swh-model date_offset to its DB representation. Args: - date_offset: a :mod:`swh.model` compatible date_offset + ts_with_tz: a TimestampWithTimezone object Returns: dict: a dictionary with three keys: - timestamp: a date in ISO format - offset: the UTC offset in minutes - neg_utc_offset: a boolean indicating whether a null offset is negative or positive. """ - if date_offset is None: + if ts_with_tz is None: return DEFAULT_DATE - normalized = identifiers.normalize_timestamp(date_offset) - - ts = normalized["timestamp"] - seconds = ts.get("seconds", 0) - microseconds = ts.get("microseconds", 0) + ts = ts_with_tz.timestamp - timestamp = datetime.datetime.fromtimestamp(seconds, datetime.timezone.utc) - timestamp = timestamp.replace(microsecond=microseconds) + timestamp = datetime.datetime.fromtimestamp(ts.seconds, datetime.timezone.utc) + timestamp = timestamp.replace(microsecond=ts.microseconds) return { # PostgreSQL supports isoformatted timestamps "timestamp": timestamp.isoformat(), - "offset": normalized["offset"], - "neg_utc_offset": normalized["negative_utc"], + "offset": ts_with_tz.offset, + "neg_utc_offset": ts_with_tz.negative_utc, } -def revision_to_db(rev): +def revision_to_db(revision: Revision) -> Dict[str, Any]: """Convert a swh-model revision to its database representation. 
""" - revision = rev.to_dict() - author = author_to_db(revision["author"]) - date = date_to_db(revision["date"]) - committer = author_to_db(revision["committer"]) - committer_date = date_to_db(revision["committer_date"]) + author = author_to_db(revision.author) + date = date_to_db(revision.date) + committer = author_to_db(revision.committer) + committer_date = date_to_db(revision.committer_date) return { - "id": revision["id"], + "id": revision.id, "author_fullname": author["fullname"], "author_name": author["name"], "author_email": author["email"], "date": date["timestamp"], "date_offset": date["offset"], "date_neg_utc_offset": date["neg_utc_offset"], "committer_fullname": committer["fullname"], "committer_name": committer["name"], "committer_email": committer["email"], "committer_date": committer_date["timestamp"], "committer_date_offset": committer_date["offset"], "committer_date_neg_utc_offset": committer_date["neg_utc_offset"], - "type": revision["type"], - "directory": revision["directory"], - "message": revision["message"], - "metadata": revision["metadata"], - "synthetic": revision["synthetic"], - "extra_headers": revision["extra_headers"], + "type": revision.type.value, + "directory": revision.directory, + "message": revision.message, + "metadata": None if revision.metadata is None else dict(revision.metadata), + "synthetic": revision.synthetic, + "extra_headers": revision.extra_headers, "parents": [ - {"id": revision["id"], "parent_id": parent, "parent_rank": i,} - for i, parent in enumerate(revision["parents"]) + {"id": revision.id, "parent_id": parent, "parent_rank": i,} + for i, parent in enumerate(revision.parents) ], } -def db_to_revision(db_revision: Dict[str, Any]) -> Dict[str, Any]: +def db_to_revision(db_revision: Dict[str, Any]) -> Optional[Revision]: """Convert a database representation of a revision to its swh-model representation.""" + if db_revision["type"] is None: + assert all( + v is None for (k, v) in db_revision.items() if k not in 
("id", "parents") + ) + return None author = db_to_author( db_revision["author_fullname"], db_revision["author_name"], db_revision["author_email"], ) date = db_to_date( db_revision["date"], db_revision["date_offset"], db_revision["date_neg_utc_offset"], ) committer = db_to_author( db_revision["committer_fullname"], db_revision["committer_name"], db_revision["committer_email"], ) committer_date = db_to_date( db_revision["committer_date"], db_revision["committer_date_offset"], db_revision["committer_date_neg_utc_offset"], ) + assert author, "author is None" + assert committer, "committer is None" + parents = [] if "parents" in db_revision: for parent in db_revision["parents"]: if parent: parents.append(parent) metadata = db_revision["metadata"] extra_headers = db_revision.get("extra_headers", ()) if not extra_headers and metadata and "extra_headers" in metadata: extra_headers = db_to_git_headers(metadata.pop("extra_headers")) - ret = { - "id": db_revision["id"], - "author": author, - "date": date, - "committer": committer, - "committer_date": committer_date, - "type": db_revision["type"], - "directory": db_revision["directory"], - "message": db_revision["message"], - "metadata": metadata, - "synthetic": db_revision["synthetic"], - "extra_headers": extra_headers, - "parents": parents, - } - - if "object_id" in db_revision: - ret["object_id"] = db_revision["object_id"] - - return ret + return Revision( + id=db_revision["id"], + author=author, + date=date, + committer=committer, + committer_date=committer_date, + type=RevisionType(db_revision["type"]), + directory=db_revision["directory"], + message=db_revision["message"], + metadata=metadata, + synthetic=db_revision["synthetic"], + extra_headers=extra_headers, + parents=tuple(parents), + ) -def release_to_db(rel): +def release_to_db(release: Release) -> Dict[str, Any]: """Convert a swh-model release to its database representation. 
""" - - release = rel.to_dict() - - author = author_to_db(release["author"]) - date = date_to_db(release["date"]) + author = author_to_db(release.author) + date = date_to_db(release.date) return { - "id": release["id"], + "id": release.id, "author_fullname": author["fullname"], "author_name": author["name"], "author_email": author["email"], "date": date["timestamp"], "date_offset": date["offset"], "date_neg_utc_offset": date["neg_utc_offset"], - "name": release["name"], - "target": release["target"], - "target_type": release["target_type"], - "comment": release["message"], - "synthetic": release["synthetic"], + "name": release.name, + "target": release.target, + "target_type": release.target_type.value, + "comment": release.message, + "synthetic": release.synthetic, } -def db_to_release(db_release): +def db_to_release(db_release: Dict[str, Any]) -> Optional[Release]: """Convert a database representation of a release to its swh-model representation. """ + if db_release["target_type"] is None: + assert all(v is None for (k, v) in db_release.items() if k != "id") + return None author = db_to_author( db_release["author_fullname"], db_release["author_name"], db_release["author_email"], ) date = db_to_date( db_release["date"], db_release["date_offset"], db_release["date_neg_utc_offset"] ) - ret = { - "author": author, - "date": date, - "id": db_release["id"], - "name": db_release["name"], - "message": db_release["comment"], - "synthetic": db_release["synthetic"], - "target": db_release["target"], - "target_type": db_release["target_type"], - } - - if "object_id" in db_release: - ret["object_id"] = db_release["object_id"] - - return ret + return Release( + author=author, + date=date, + id=db_release["id"], + name=db_release["name"], + message=db_release["comment"], + synthetic=db_release["synthetic"], + target=db_release["target"], + target_type=ObjectType(db_release["target_type"]), + ) def db_to_raw_extrinsic_metadata(row) -> RawExtrinsicMetadata: type_ = 
MetadataTargetType(row["raw_extrinsic_metadata.type"]) id_ = row["raw_extrinsic_metadata.id"] if type_ != MetadataTargetType.ORIGIN: id_ = parse_swhid(id_) return RawExtrinsicMetadata( type=type_, id=id_, authority=MetadataAuthority( type=MetadataAuthorityType(row["metadata_authority.type"]), url=row["metadata_authority.url"], ), fetcher=MetadataFetcher( name=row["metadata_fetcher.name"], version=row["metadata_fetcher.version"], ), discovery_date=row["discovery_date"], format=row["format"], metadata=row["raw_extrinsic_metadata.metadata"], origin=row["origin"], visit=row["visit"], snapshot=map_optional(parse_swhid, row["snapshot"]), release=map_optional(parse_swhid, row["release"]), revision=map_optional(parse_swhid, row["revision"]), path=row["path"], directory=map_optional(parse_swhid, row["directory"]), ) -def origin_url_to_sha1(origin_url): +def origin_url_to_sha1(origin_url: str) -> bytes: """Convert an origin URL to a sha1. Encodes URL to utf-8.""" return MultiHash.from_data(origin_url.encode("utf-8"), {"sha1"}).digest()["sha1"] diff --git a/swh/storage/storage.py b/swh/storage/storage.py index ab262cf8..8af3ac68 100644 --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -1,1434 +1,1434 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import contextlib import datetime import itertools from collections import defaultdict from contextlib import contextmanager from typing import ( Any, Counter, Dict, Iterable, List, Optional, Tuple, Union, ) import attr import psycopg2 import psycopg2.pool import psycopg2.errors from swh.core.api.serializers import msgpack_loads, msgpack_dumps from swh.model.identifiers import SWHID from swh.model.model import ( Content, Directory, Origin, OriginVisit, OriginVisitStatus, Revision, Release, SkippedContent, 
Sha1, Sha1Git, Snapshot, SnapshotBranch, TargetType, SHA1_SIZE, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, MetadataTargetType, RawExtrinsicMetadata, ) from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex from swh.storage.interface import ( ListOrder, PagedResult, PartialBranches, VISIT_STATUSES, ) from swh.storage.objstorage import ObjStorage from swh.storage.utils import now from . import converters from .common import db_transaction_generator, db_transaction from .db import Db from .exc import StorageArgumentException, StorageDBError, HashCollision from .algos import diff from .metrics import timed, send_metric, process_metrics from .utils import get_partition_bounds_bytes, extract_collision_hash, map_optional from .writer import JournalWriter # Max block size of contents to return BULK_BLOCK_CONTENT_LEN_MAX = 10000 EMPTY_SNAPSHOT_ID = hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e") """Identifier for the empty snapshot""" VALIDATION_EXCEPTIONS = ( KeyError, TypeError, ValueError, psycopg2.errors.CheckViolation, psycopg2.errors.IntegrityError, psycopg2.errors.InvalidTextRepresentation, psycopg2.errors.NotNullViolation, psycopg2.errors.NumericValueOutOfRange, psycopg2.errors.UndefinedFunction, # (raised on wrong argument types) ) """Exceptions raised by postgresql when validation of the arguments failed.""" @contextlib.contextmanager def convert_validation_exceptions(): """Catches postgresql errors related to invalid arguments, and re-raises a StorageArgumentException.""" try: yield except tuple(VALIDATION_EXCEPTIONS) as e: raise StorageArgumentException(str(e)) class Storage: """SWH storage proxy, encompassing DB and object storage """ def __init__( self, db, objstorage, min_pool_conns=1, max_pool_conns=10, journal_writer=None ): """ Args: db: either a libpq connection string, or a psycopg2 connection objstorage: passed to ObjStorage to set up the object storage """ try: if isinstance(db, psycopg2.extensions.connection): 
self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) self.journal_writer = JournalWriter(journal_writer) self.objstorage = ObjStorage(objstorage) def get_db(self): if self._db: return self._db else: return Db.from_pool(self._pool) def put_db(self, db): if db is not self._db: db.put_conn() @contextmanager def db(self): db = None try: db = self.get_db() yield db finally: if db: self.put_db(db) @timed @db_transaction() def check_config(self, *, check_write: bool, db=None, cur=None) -> bool: if not self.objstorage.check_config(check_write=check_write): return False # Check permissions on one of the tables if check_write: check = "INSERT" else: check = "SELECT" cur.execute("select has_table_privilege(current_user, 'content', %s)", (check,)) return cur.fetchone()[0] def _content_unique_key(self, hash, db): """Given a hash (tuple or dict), return a unique key from the aggregation of keys. """ keys = db.content_hash_keys if isinstance(hash, tuple): return hash return tuple([hash[k] for k in keys]) def _content_add_metadata(self, db, cur, content): """Add content to the postgresql database but not the object storage. 
""" # create temporary table for metadata injection db.mktemp("content", cur) db.copy_to( (c.to_dict() for c in content), "tmp_content", db.content_add_keys, cur ) # move metadata in place try: db.content_add_from_temp(cur) except psycopg2.IntegrityError as e: if e.diag.sqlstate == "23505" and e.diag.table_name == "content": message_detail = e.diag.message_detail if message_detail: hash_name, hash_id = extract_collision_hash(message_detail) collision_contents_hashes = [ c.hashes() for c in content if c.get_hash(hash_name) == hash_id ] else: constraint_to_hash_name = { "content_pkey": "sha1", "content_sha1_git_idx": "sha1_git", "content_sha256_idx": "sha256", } hash_name = constraint_to_hash_name.get(e.diag.constraint_name) hash_id = None collision_contents_hashes = None raise HashCollision( hash_name, hash_id, collision_contents_hashes ) from None else: raise @timed @process_metrics def content_add(self, content: List[Content]) -> Dict: ctime = now() contents = [attr.evolve(c, ctime=ctime) for c in content] objstorage_summary = self.objstorage.content_add(contents) with self.db() as db: with db.transaction() as cur: missing = list( self.content_missing( map(Content.to_dict, contents), key_hash="sha1_git", db=db, cur=cur, ) ) contents = [c for c in contents if c.sha1_git in missing] self.journal_writer.content_add(contents) self._content_add_metadata(db, cur, contents) return { "content:add": len(contents), "content:add:bytes": objstorage_summary["content:add:bytes"], } @timed @db_transaction() def content_update( self, contents: List[Dict[str, Any]], keys: List[str] = [], db=None, cur=None ) -> None: # TODO: Add a check on input keys. How to properly implement # this? We don't know yet the new columns. 
self.journal_writer.content_update(contents) db.mktemp("content", cur) select_keys = list(set(db.content_get_metadata_keys).union(set(keys))) with convert_validation_exceptions(): db.copy_to(contents, "tmp_content", select_keys, cur) db.content_update_from_temp(keys_to_update=keys, cur=cur) @timed @process_metrics @db_transaction() def content_add_metadata(self, content: List[Content], db=None, cur=None) -> Dict: missing = self.content_missing( (c.to_dict() for c in content), key_hash="sha1_git", db=db, cur=cur, ) contents = [c for c in content if c.sha1_git in missing] self.journal_writer.content_add_metadata(contents) self._content_add_metadata(db, cur, contents) return { "content:add": len(contents), } @timed def content_get_data(self, content: Sha1) -> Optional[bytes]: # FIXME: Make this method support slicing the `data` return self.objstorage.content_get(content) @timed @db_transaction() def content_get_partition( self, partition_id: int, nb_partitions: int, page_token: Optional[str] = None, limit: int = 1000, db=None, cur=None, ) -> PagedResult[Content]: if limit is None: raise StorageArgumentException("limit should not be None") (start, end) = get_partition_bounds_bytes( partition_id, nb_partitions, SHA1_SIZE ) if page_token: start = hash_to_bytes(page_token) if end is None: end = b"\xff" * SHA1_SIZE next_page_token: Optional[str] = None contents = [] for counter, row in enumerate(db.content_get_range(start, end, limit + 1, cur)): row_d = dict(zip(db.content_get_metadata_keys, row)) content = Content(**row_d) if counter >= limit: # take the last content for the next page starting from this next_page_token = hash_to_hex(content.sha1) break contents.append(content) assert len(contents) <= limit return PagedResult(results=contents, next_page_token=next_page_token) @timed @db_transaction(statement_timeout=500) def content_get( self, contents: List[Sha1], db=None, cur=None ) -> List[Optional[Content]]: contents_by_sha1: Dict[Sha1, Optional[Content]] = {} for row 
in db.content_get_metadata_from_sha1s(contents, cur): row_d = dict(zip(db.content_get_metadata_keys, row)) content = Content(**row_d) contents_by_sha1[content.sha1] = content return [contents_by_sha1.get(sha1) for sha1 in contents] @timed @db_transaction_generator() def content_missing( self, contents: List[Dict[str, Any]], key_hash: str = "sha1", db=None, cur=None ) -> Iterable[bytes]: if key_hash not in DEFAULT_ALGORITHMS: raise StorageArgumentException( "key_hash should be one of {','.join(DEFAULT_ALGORITHMS)}" ) keys = db.content_hash_keys key_hash_idx = keys.index(key_hash) for obj in db.content_missing_from_list(contents, cur): yield obj[key_hash_idx] @timed @db_transaction_generator() def content_missing_per_sha1( self, contents: List[bytes], db=None, cur=None ) -> Iterable[bytes]: for obj in db.content_missing_per_sha1(contents, cur): yield obj[0] @timed @db_transaction_generator() def content_missing_per_sha1_git( self, contents: List[bytes], db=None, cur=None ) -> Iterable[Sha1Git]: for obj in db.content_missing_per_sha1_git(contents, cur): yield obj[0] @timed @db_transaction() def content_find(self, content: Dict[str, Any], db=None, cur=None) -> List[Content]: if not set(content).intersection(DEFAULT_ALGORITHMS): raise StorageArgumentException( "content keys must contain at least one " f"of: {', '.join(sorted(DEFAULT_ALGORITHMS))}" ) rows = db.content_find( sha1=content.get("sha1"), sha1_git=content.get("sha1_git"), sha256=content.get("sha256"), blake2s256=content.get("blake2s256"), cur=cur, ) contents = [] for row in rows: row_d = dict(zip(db.content_find_cols, row)) contents.append(Content(**row_d)) return contents @timed @db_transaction() def content_get_random(self, db=None, cur=None) -> Sha1Git: return db.content_get_random(cur) @staticmethod def _skipped_content_normalize(d): d = d.copy() if d.get("status") is None: d["status"] = "absent" if d.get("length") is None: d["length"] = -1 return d def _skipped_content_add_metadata(self, db, cur, content: 
List[SkippedContent]): origin_ids = db.origin_id_get_by_url([cont.origin for cont in content], cur=cur) content = [ attr.evolve(c, origin=origin_id) for (c, origin_id) in zip(content, origin_ids) ] db.mktemp("skipped_content", cur) db.copy_to( [c.to_dict() for c in content], "tmp_skipped_content", db.skipped_content_keys, cur, ) # move metadata in place db.skipped_content_add_from_temp(cur) @timed @process_metrics @db_transaction() def skipped_content_add( self, content: List[SkippedContent], db=None, cur=None ) -> Dict: ctime = now() content = [attr.evolve(c, ctime=ctime) for c in content] missing_contents = self.skipped_content_missing( (c.to_dict() for c in content), db=db, cur=cur, ) content = [ c for c in content if any( all( c.get_hash(algo) == missing_content.get(algo) for algo in DEFAULT_ALGORITHMS ) for missing_content in missing_contents ) ] self.journal_writer.skipped_content_add(content) self._skipped_content_add_metadata(db, cur, content) return { "skipped_content:add": len(content), } @timed @db_transaction_generator() def skipped_content_missing( self, contents: List[Dict[str, Any]], db=None, cur=None ) -> Iterable[Dict[str, Any]]: contents = list(contents) for content in db.skipped_content_missing(contents, cur): yield dict(zip(db.content_hash_keys, content)) @timed @process_metrics @db_transaction() def directory_add(self, directories: List[Directory], db=None, cur=None) -> Dict: summary = {"directory:add": 0} dirs = set() dir_entries: Dict[str, defaultdict] = { "file": defaultdict(list), "dir": defaultdict(list), "rev": defaultdict(list), } for cur_dir in directories: dir_id = cur_dir.id dirs.add(dir_id) for src_entry in cur_dir.entries: entry = src_entry.to_dict() entry["dir_id"] = dir_id dir_entries[entry["type"]][dir_id].append(entry) dirs_missing = set(self.directory_missing(dirs, db=db, cur=cur)) if not dirs_missing: return summary self.journal_writer.directory_add( dir_ for dir_ in directories if dir_.id in dirs_missing ) # Copy directory 
ids dirs_missing_dict = ({"id": dir} for dir in dirs_missing) db.mktemp("directory", cur) db.copy_to(dirs_missing_dict, "tmp_directory", ["id"], cur) # Copy entries for entry_type, entry_list in dir_entries.items(): entries = itertools.chain.from_iterable( entries_for_dir for dir_id, entries_for_dir in entry_list.items() if dir_id in dirs_missing ) db.mktemp_dir_entry(entry_type) db.copy_to( entries, "tmp_directory_entry_%s" % entry_type, ["target", "name", "perms", "dir_id"], cur, ) # Do the final copy db.directory_add_from_temp(cur) summary["directory:add"] = len(dirs_missing) return summary @timed @db_transaction_generator() def directory_missing( self, directories: List[Sha1Git], db=None, cur=None ) -> Iterable[Sha1Git]: for obj in db.directory_missing_from_list(directories, cur): yield obj[0] @timed @db_transaction_generator(statement_timeout=20000) def directory_ls( self, directory: Sha1Git, recursive: bool = False, db=None, cur=None ) -> Iterable[Dict[str, Any]]: if recursive: res_gen = db.directory_walk(directory, cur=cur) else: res_gen = db.directory_walk_one(directory, cur=cur) for line in res_gen: yield dict(zip(db.directory_ls_cols, line)) @timed @db_transaction(statement_timeout=2000) def directory_entry_get_by_path( self, directory: Sha1Git, paths: List[bytes], db=None, cur=None ) -> Optional[Dict[str, Any]]: res = db.directory_entry_get_by_path(directory, paths, cur) return dict(zip(db.directory_ls_cols, res)) if res else None @timed @db_transaction() def directory_get_random(self, db=None, cur=None) -> Sha1Git: return db.directory_get_random(cur) @timed @process_metrics @db_transaction() def revision_add(self, revisions: List[Revision], db=None, cur=None) -> Dict: summary = {"revision:add": 0} revisions_missing = set( self.revision_missing( set(revision.id for revision in revisions), db=db, cur=cur ) ) if not revisions_missing: return summary db.mktemp_revision(cur) revisions_filtered = [ revision for revision in revisions if revision.id in 
revisions_missing ] self.journal_writer.revision_add(revisions_filtered) - revisions_filtered = list(map(converters.revision_to_db, revisions_filtered)) + db_revisions_filtered = list(map(converters.revision_to_db, revisions_filtered)) parents_filtered: List[bytes] = [] with convert_validation_exceptions(): db.copy_to( - revisions_filtered, + db_revisions_filtered, "tmp_revision", db.revision_add_cols, cur, lambda rev: parents_filtered.extend(rev["parents"]), ) db.revision_add_from_temp(cur) db.copy_to( parents_filtered, "revision_history", ["id", "parent_id", "parent_rank"], cur, ) return {"revision:add": len(revisions_missing)} @timed @db_transaction_generator() def revision_missing( self, revisions: List[Sha1Git], db=None, cur=None ) -> Iterable[Sha1Git]: if not revisions: return None for obj in db.revision_missing_from_list(revisions, cur): yield obj[0] @timed @db_transaction_generator(statement_timeout=1000) def revision_get( self, revisions: List[Sha1Git], db=None, cur=None ) -> Iterable[Optional[Dict[str, Any]]]: for line in db.revision_get_from_list(revisions, cur): data = converters.db_to_revision(dict(zip(db.revision_get_cols, line))) - if not data["type"]: + if not data: yield None continue - yield data + yield data.to_dict() @timed @db_transaction_generator(statement_timeout=2000) def revision_log( self, revisions: List[Sha1Git], limit: Optional[int] = None, db=None, cur=None ) -> Iterable[Optional[Dict[str, Any]]]: for line in db.revision_log(revisions, limit, cur): data = converters.db_to_revision(dict(zip(db.revision_get_cols, line))) - if not data["type"]: + if not data: yield None continue - yield data + yield data.to_dict() @timed @db_transaction_generator(statement_timeout=2000) def revision_shortlog( self, revisions: List[Sha1Git], limit: Optional[int] = None, db=None, cur=None ) -> Iterable[Optional[Tuple[Sha1Git, Tuple[Sha1Git, ...]]]]: yield from db.revision_shortlog(revisions, limit, cur) @timed @db_transaction() def 
revision_get_random(self, db=None, cur=None) -> Sha1Git: return db.revision_get_random(cur) @timed @process_metrics @db_transaction() def release_add(self, releases: List[Release], db=None, cur=None) -> Dict: summary = {"release:add": 0} release_ids = set(release.id for release in releases) releases_missing = set(self.release_missing(release_ids, db=db, cur=cur)) if not releases_missing: return summary db.mktemp_release(cur) releases_filtered = [ release for release in releases if release.id in releases_missing ] self.journal_writer.release_add(releases_filtered) - releases_filtered = list(map(converters.release_to_db, releases_filtered)) + db_releases_filtered = list(map(converters.release_to_db, releases_filtered)) with convert_validation_exceptions(): - db.copy_to(releases_filtered, "tmp_release", db.release_add_cols, cur) + db.copy_to(db_releases_filtered, "tmp_release", db.release_add_cols, cur) db.release_add_from_temp(cur) return {"release:add": len(releases_missing)} @timed @db_transaction_generator() def release_missing( self, releases: List[Sha1Git], db=None, cur=None ) -> Iterable[Sha1Git]: if not releases: return for obj in db.release_missing_from_list(releases, cur): yield obj[0] @timed @db_transaction_generator(statement_timeout=500) def release_get( self, releases: List[Sha1Git], db=None, cur=None ) -> Iterable[Optional[Dict[str, Any]]]: for release in db.release_get_from_list(releases, cur): data = converters.db_to_release(dict(zip(db.release_get_cols, release))) - yield data if data["target_type"] else None + yield data.to_dict() if data else None @timed @db_transaction() def release_get_random(self, db=None, cur=None) -> Sha1Git: return db.release_get_random(cur) @timed @process_metrics @db_transaction() def snapshot_add(self, snapshots: List[Snapshot], db=None, cur=None) -> Dict: created_temp_table = False count = 0 for snapshot in snapshots: if not db.snapshot_exists(snapshot.id, cur): if not created_temp_table: db.mktemp_snapshot_branch(cur) 
created_temp_table = True with convert_validation_exceptions(): db.copy_to( ( { "name": name, "target": info.target if info else None, "target_type": ( info.target_type.value if info else None ), } for name, info in snapshot.branches.items() ), "tmp_snapshot_branch", ["name", "target", "target_type"], cur, ) self.journal_writer.snapshot_add([snapshot]) db.snapshot_add(snapshot.id, cur) count += 1 return {"snapshot:add": count} @timed @db_transaction_generator() def snapshot_missing( self, snapshots: List[Sha1Git], db=None, cur=None ) -> Iterable[Sha1Git]: for obj in db.snapshot_missing_from_list(snapshots, cur): yield obj[0] @timed @db_transaction(statement_timeout=2000) def snapshot_get( self, snapshot_id: Sha1Git, db=None, cur=None ) -> Optional[Dict[str, Any]]: d = self.snapshot_get_branches(snapshot_id) if d is None: return d return { "id": d["id"], "branches": { name: branch.to_dict() if branch else None for (name, branch) in d["branches"].items() }, "next_branch": d["next_branch"], } @timed @db_transaction(statement_timeout=2000) def snapshot_count_branches( self, snapshot_id: Sha1Git, db=None, cur=None ) -> Optional[Dict[Optional[str], int]]: return dict([bc for bc in db.snapshot_count_branches(snapshot_id, cur)]) @timed @db_transaction(statement_timeout=2000) def snapshot_get_branches( self, snapshot_id: Sha1Git, branches_from: bytes = b"", branches_count: int = 1000, target_types: Optional[List[str]] = None, db=None, cur=None, ) -> Optional[PartialBranches]: if snapshot_id == EMPTY_SNAPSHOT_ID: return PartialBranches(id=snapshot_id, branches={}, next_branch=None,) branches = {} next_branch = None fetched_branches = list( db.snapshot_get_by_id( snapshot_id, branches_from=branches_from, branches_count=branches_count + 1, target_types=target_types, cur=cur, ) ) for row in fetched_branches[:branches_count]: branch_d = dict(zip(db.snapshot_get_cols, row)) del branch_d["snapshot_id"] name = branch_d.pop("name") if branch_d["target"] is None and 
branch_d["target_type"] is None: branch = None else: assert branch_d["target_type"] is not None branch = SnapshotBranch( target=branch_d["target"], target_type=TargetType(branch_d["target_type"]), ) branches[name] = branch if len(fetched_branches) > branches_count: next_branch = dict(zip(db.snapshot_get_cols, fetched_branches[-1]))["name"] if branches: return PartialBranches( id=snapshot_id, branches=branches, next_branch=next_branch, ) return None @timed @db_transaction() def snapshot_get_random(self, db=None, cur=None) -> Sha1Git: return db.snapshot_get_random(cur) @timed @db_transaction() def origin_visit_add( self, visits: List[OriginVisit], db=None, cur=None ) -> Iterable[OriginVisit]: for visit in visits: origin = self.origin_get([visit.origin], db=db, cur=cur)[0] if not origin: # Cannot add a visit without an origin raise StorageArgumentException("Unknown origin %s", visit.origin) all_visits = [] nb_visits = 0 for visit in visits: nb_visits += 1 if not visit.visit: with convert_validation_exceptions(): visit_id = db.origin_visit_add( visit.origin, visit.date, visit.type, cur=cur ) visit = attr.evolve(visit, visit=visit_id) else: db.origin_visit_add_with_id(visit, cur=cur) assert visit.visit is not None all_visits.append(visit) # Forced to write after for the case when the visit has no id self.journal_writer.origin_visit_add([visit]) visit_status = OriginVisitStatus( origin=visit.origin, visit=visit.visit, date=visit.date, status="created", snapshot=None, ) self._origin_visit_status_add(visit_status, db=db, cur=cur) send_metric("origin_visit:add", count=nb_visits, method_name="origin_visit") return all_visits def _origin_visit_status_add( self, visit_status: OriginVisitStatus, db, cur ) -> None: """Add an origin visit status""" self.journal_writer.origin_visit_status_add([visit_status]) db.origin_visit_status_add(visit_status, cur=cur) send_metric( "origin_visit_status:add", count=1, method_name="origin_visit_status" ) @timed @db_transaction() def 
origin_visit_status_add( self, visit_statuses: List[OriginVisitStatus], db=None, cur=None, ) -> None: # First round to check existence (fail early if any is ko) for visit_status in visit_statuses: origin_url = self.origin_get([visit_status.origin], db=db, cur=cur)[0] if not origin_url: raise StorageArgumentException(f"Unknown origin {visit_status.origin}") for visit_status in visit_statuses: self._origin_visit_status_add(visit_status, db, cur) @timed @db_transaction() def origin_visit_status_get_latest( self, origin_url: str, visit: int, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, db=None, cur=None, ) -> Optional[OriginVisitStatus]: if allowed_statuses and not set(allowed_statuses).intersection(VISIT_STATUSES): raise StorageArgumentException( f"Unknown allowed statuses {','.join(allowed_statuses)}, only " f"{','.join(VISIT_STATUSES)} authorized" ) row = db.origin_visit_status_get_latest( origin_url, visit, allowed_statuses, require_snapshot, cur=cur ) if not row: return None return OriginVisitStatus.from_dict(row) @timed @db_transaction(statement_timeout=500) def origin_visit_get( self, origin: str, page_token: Optional[str] = None, order: ListOrder = ListOrder.ASC, limit: int = 10, db=None, cur=None, ) -> PagedResult[OriginVisit]: page_token = page_token or "0" if not isinstance(order, ListOrder): raise StorageArgumentException("order must be a ListOrder value") if not isinstance(page_token, str): raise StorageArgumentException("page_token must be a string.") next_page_token = None visit_from = int(page_token) visits: List[OriginVisit] = [] extra_limit = limit + 1 for row in db.origin_visit_get_range( origin, visit_from=visit_from, order=order, limit=extra_limit, cur=cur ): row_d = dict(zip(db.origin_visit_cols, row)) visits.append( OriginVisit( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], type=row_d["type"], ) ) assert len(visits) <= extra_limit if len(visits) == extra_limit: visits = visits[:limit] 
next_page_token = str(visits[-1].visit) return PagedResult(results=visits, next_page_token=next_page_token) @timed @db_transaction(statement_timeout=500) def origin_visit_find_by_date( self, origin: str, visit_date: datetime.datetime, db=None, cur=None ) -> Optional[OriginVisit]: row_d = db.origin_visit_find_by_date(origin, visit_date, cur=cur) if not row_d: return None return OriginVisit( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], type=row_d["type"], ) @timed @db_transaction(statement_timeout=500) def origin_visit_get_by( self, origin: str, visit: int, db=None, cur=None ) -> Optional[OriginVisit]: row = db.origin_visit_get(origin, visit, cur) if row: row_d = dict(zip(db.origin_visit_get_cols, row)) return OriginVisit( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], type=row_d["type"], ) return None @timed @db_transaction(statement_timeout=4000) def origin_visit_get_latest( self, origin: str, type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, db=None, cur=None, ) -> Optional[OriginVisit]: if allowed_statuses and not set(allowed_statuses).intersection(VISIT_STATUSES): raise StorageArgumentException( f"Unknown allowed statuses {','.join(allowed_statuses)}, only " f"{','.join(VISIT_STATUSES)} authorized" ) row = db.origin_visit_get_latest( origin, type=type, allowed_statuses=allowed_statuses, require_snapshot=require_snapshot, cur=cur, ) if row: row_d = dict(zip(db.origin_visit_get_cols, row)) visit = OriginVisit( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], type=row_d["type"], ) return visit return None @timed @db_transaction(statement_timeout=500) def origin_visit_status_get( self, origin: str, visit: int, page_token: Optional[str] = None, order: ListOrder = ListOrder.ASC, limit: int = 10, db=None, cur=None, ) -> PagedResult[OriginVisitStatus]: next_page_token = None date_from = None if page_token is not None: date_from = 
datetime.datetime.fromisoformat(page_token) visit_statuses: List[OriginVisitStatus] = [] # Take one more visit status so we can reuse it as the next page token if any for row in db.origin_visit_status_get_range( origin, visit, date_from=date_from, order=order, limit=limit + 1, cur=cur, ): row_d = dict(zip(db.origin_visit_status_cols, row)) visit_statuses.append( OriginVisitStatus( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], status=row_d["status"], snapshot=row_d["snapshot"], metadata=row_d["metadata"], ) ) if len(visit_statuses) > limit: # last visit status date is the next page token next_page_token = str(visit_statuses[-1].date) # excluding that visit status from the result to respect the limit size visit_statuses = visit_statuses[:limit] return PagedResult(results=visit_statuses, next_page_token=next_page_token) @timed @db_transaction() def origin_visit_status_get_random( self, type: str, db=None, cur=None ) -> Optional[Tuple[OriginVisit, OriginVisitStatus]]: row = db.origin_visit_get_random(type, cur) if row is not None: row_d = dict(zip(db.origin_visit_get_cols, row)) visit = OriginVisit( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], type=row_d["type"], ) visit_status = OriginVisitStatus( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], status=row_d["status"], metadata=row_d["metadata"], snapshot=row_d["snapshot"], ) return visit, visit_status return None @timed @db_transaction(statement_timeout=2000) def object_find_by_sha1_git( self, ids: List[Sha1Git], db=None, cur=None ) -> Dict[Sha1Git, List[Dict]]: ret: Dict[Sha1Git, List[Dict]] = {id: [] for id in ids} for retval in db.object_find_by_sha1_git(ids, cur=cur): if retval[1]: ret[retval[0]].append( dict(zip(db.object_find_by_sha1_git_cols, retval)) ) return ret @timed @db_transaction(statement_timeout=500) def origin_get( self, origins: List[str], db=None, cur=None ) -> Iterable[Optional[Origin]]: rows = db.origin_get_by_url(origins, cur) result: 
List[Optional[Origin]] = [] for row in rows: origin_d = dict(zip(db.origin_cols, row)) url = origin_d["url"] result.append(None if url is None else Origin(url=url)) return result @timed @db_transaction(statement_timeout=500) def origin_get_by_sha1( self, sha1s: List[bytes], db=None, cur=None ) -> List[Optional[Dict[str, Any]]]: return [ dict(zip(db.origin_cols, row)) if row[0] else None for row in db.origin_get_by_sha1(sha1s, cur) ] @timed @db_transaction_generator() def origin_get_range(self, origin_from=1, origin_count=100, db=None, cur=None): for origin in db.origin_get_range(origin_from, origin_count, cur): yield dict(zip(db.origin_get_range_cols, origin)) @timed @db_transaction() def origin_list( self, page_token: Optional[str] = None, limit: int = 100, *, db=None, cur=None ) -> PagedResult[Origin]: page_token = page_token or "0" if not isinstance(page_token, str): raise StorageArgumentException("page_token must be a string.") origin_from = int(page_token) next_page_token = None origins: List[Origin] = [] # Take one more origin so we can reuse it as the next page token if any for row_d in self.origin_get_range(origin_from, limit + 1, db=db, cur=cur): origins.append(Origin(url=row_d["url"])) # keep the last_id for the pagination if needed last_id = row_d["id"] if len(origins) > limit: # data left for subsequent call # last origin id is the next page token next_page_token = str(last_id) # excluding that origin from the result to respect the limit size origins = origins[:limit] assert len(origins) <= limit return PagedResult(results=origins, next_page_token=next_page_token) @timed @db_transaction() def origin_search( self, url_pattern: str, page_token: Optional[str] = None, limit: int = 50, regexp: bool = False, with_visit: bool = False, db=None, cur=None, ) -> PagedResult[Origin]: next_page_token = None offset = int(page_token) if page_token else 0 origins = [] # Take one more origin so we can reuse it as the next page token if any for origin in 
db.origin_search( url_pattern, offset, limit + 1, regexp, with_visit, cur ): row_d = dict(zip(db.origin_cols, origin)) origins.append(Origin(url=row_d["url"])) if len(origins) > limit: # next offset next_page_token = str(offset + limit) # excluding that origin from the result to respect the limit size origins = origins[:limit] assert len(origins) <= limit return PagedResult(results=origins, next_page_token=next_page_token) @timed @db_transaction() def origin_count( self, url_pattern: str, regexp: bool = False, with_visit: bool = False, db=None, cur=None, ) -> int: return db.origin_count(url_pattern, regexp, with_visit, cur) @timed @process_metrics @db_transaction() def origin_add(self, origins: List[Origin], db=None, cur=None) -> Dict[str, int]: urls = [o.url for o in origins] known_origins = set(url for (url,) in db.origin_get_by_url(urls, cur)) # use lists here to keep origins sorted; some tests depend on this to_add = [url for url in urls if url not in known_origins] self.journal_writer.origin_add([Origin(url=url) for url in to_add]) added = 0 for url in to_add: if db.origin_add(url, cur): added += 1 return {"origin:add": added} @db_transaction(statement_timeout=500) def stat_counters(self, db=None, cur=None): return {k: v for (k, v) in db.stat_counters()} @db_transaction() def refresh_stat_counters(self, db=None, cur=None): keys = [ "content", "directory", "directory_entry_dir", "directory_entry_file", "directory_entry_rev", "origin", "origin_visit", "person", "release", "revision", "revision_history", "skipped_content", "snapshot", ] for key in keys: cur.execute("select * from swh_update_counter(%s)", (key,)) @db_transaction() def raw_extrinsic_metadata_add( self, metadata: List[RawExtrinsicMetadata], db, cur, ) -> None: metadata = list(metadata) self.journal_writer.raw_extrinsic_metadata_add(metadata) counter = Counter[MetadataTargetType]() for metadata_entry in metadata: authority_id = self._get_authority_id(metadata_entry.authority, db, cur) fetcher_id = 
self._get_fetcher_id(metadata_entry.fetcher, db, cur) db.raw_extrinsic_metadata_add( type=metadata_entry.type.value, id=str(metadata_entry.id), discovery_date=metadata_entry.discovery_date, authority_id=authority_id, fetcher_id=fetcher_id, format=metadata_entry.format, metadata=metadata_entry.metadata, origin=metadata_entry.origin, visit=metadata_entry.visit, snapshot=map_optional(str, metadata_entry.snapshot), release=map_optional(str, metadata_entry.release), revision=map_optional(str, metadata_entry.revision), path=metadata_entry.path, directory=map_optional(str, metadata_entry.directory), cur=cur, ) counter[metadata_entry.type] += 1 for (type, count) in counter.items(): send_metric( f"{type.value}_metadata:add", count=count, method_name=f"{type.value}_metadata_add", ) @db_transaction() def raw_extrinsic_metadata_get( self, type: MetadataTargetType, id: Union[str, SWHID], authority: MetadataAuthority, after: Optional[datetime.datetime] = None, page_token: Optional[bytes] = None, limit: int = 1000, db=None, cur=None, ) -> PagedResult[RawExtrinsicMetadata]: if type == MetadataTargetType.ORIGIN: if isinstance(id, SWHID): raise StorageArgumentException( f"raw_extrinsic_metadata_get called with type='origin', " f"but provided id is an SWHID: {id!r}" ) else: if not isinstance(id, SWHID): raise StorageArgumentException( f"raw_extrinsic_metadata_get called with type!='origin', " f"but provided id is not an SWHID: {id!r}" ) if page_token: (after_time, after_fetcher) = msgpack_loads(base64.b64decode(page_token)) if after and after_time < after: raise StorageArgumentException( "page_token is inconsistent with the value of 'after'." 
) else: after_time = after after_fetcher = None authority_id = self._get_authority_id(authority, db, cur) if not authority_id: return PagedResult(next_page_token=None, results=[],) rows = db.raw_extrinsic_metadata_get( type, str(id), authority_id, after_time, after_fetcher, limit + 1, cur, ) rows = [dict(zip(db.raw_extrinsic_metadata_get_cols, row)) for row in rows] results = [] for row in rows: assert str(id) == row["raw_extrinsic_metadata.id"] results.append(converters.db_to_raw_extrinsic_metadata(row)) if len(results) > limit: results.pop() assert len(results) == limit last_returned_row = rows[-2] # rows[-1] corresponds to the popped result next_page_token: Optional[str] = base64.b64encode( msgpack_dumps( ( last_returned_row["discovery_date"], last_returned_row["metadata_fetcher.id"], ) ) ).decode() else: next_page_token = None return PagedResult(next_page_token=next_page_token, results=results,) @timed @db_transaction() def metadata_fetcher_add( self, fetchers: List[MetadataFetcher], db=None, cur=None ) -> None: fetchers = list(fetchers) self.journal_writer.metadata_fetcher_add(fetchers) count = 0 for fetcher in fetchers: if fetcher.metadata is None: raise StorageArgumentException( "MetadataFetcher.metadata may not be None in metadata_fetcher_add." 
) db.metadata_fetcher_add( fetcher.name, fetcher.version, dict(fetcher.metadata), cur=cur ) count += 1 send_metric("metadata_fetcher:add", count=count, method_name="metadata_fetcher") @timed @db_transaction(statement_timeout=500) def metadata_fetcher_get( self, name: str, version: str, db=None, cur=None ) -> Optional[MetadataFetcher]: row = db.metadata_fetcher_get(name, version, cur=cur) if not row: return None return MetadataFetcher.from_dict(dict(zip(db.metadata_fetcher_cols, row))) @timed @db_transaction() def metadata_authority_add( self, authorities: List[MetadataAuthority], db=None, cur=None ) -> None: authorities = list(authorities) self.journal_writer.metadata_authority_add(authorities) count = 0 for authority in authorities: if authority.metadata is None: raise StorageArgumentException( "MetadataAuthority.metadata may not be None in " "metadata_authority_add." ) db.metadata_authority_add( authority.type.value, authority.url, dict(authority.metadata), cur=cur ) count += 1 send_metric( "metadata_authority:add", count=count, method_name="metadata_authority" ) @timed @db_transaction() def metadata_authority_get( self, type: MetadataAuthorityType, url: str, db=None, cur=None ) -> Optional[MetadataAuthority]: row = db.metadata_authority_get(type.value, url, cur=cur) if not row: return None return MetadataAuthority.from_dict(dict(zip(db.metadata_authority_cols, row))) @timed def diff_directories(self, from_dir, to_dir, track_renaming=False): return diff.diff_directories(self, from_dir, to_dir, track_renaming) @timed def diff_revisions(self, from_rev, to_rev, track_renaming=False): return diff.diff_revisions(self, from_rev, to_rev, track_renaming) @timed def diff_revision(self, revision, track_renaming=False): return diff.diff_revision(self, revision, track_renaming) def clear_buffers(self, object_types: Optional[List[str]] = None) -> None: """Do nothing """ return None def flush(self, object_types: Optional[List[str]] = None) -> Dict: return {} def 
_get_authority_id(self, authority: MetadataAuthority, db, cur): authority_id = db.metadata_authority_get_id( authority.type.value, authority.url, cur ) if not authority_id: raise StorageArgumentException(f"Unknown authority {authority}") return authority_id def _get_fetcher_id(self, fetcher: MetadataFetcher, db, cur): fetcher_id = db.metadata_fetcher_get_id(fetcher.name, fetcher.version, cur) if not fetcher_id: raise StorageArgumentException(f"Unknown fetcher {fetcher}") return fetcher_id diff --git a/swh/storage/tests/test_converters.py b/swh/storage/tests/test_converters.py index 55a49f87..f9347e92 100644 --- a/swh/storage/tests/test_converters.py +++ b/swh/storage/tests/test_converters.py @@ -1,151 +1,167 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from swh.model.model import ( + ObjectType, + Person, + Release, + Revision, + RevisionType, + Timestamp, + TimestampWithTimezone, +) + from swh.storage import converters def test_date_to_db(): date_to_db = converters.date_to_db assert date_to_db(None) == {"timestamp": None, "offset": 0, "neg_utc_offset": None} assert date_to_db( - {"timestamp": 1234567890, "offset": 120, "negative_utc": False,} + TimestampWithTimezone( + timestamp=Timestamp(seconds=1234567890, microseconds=0,), + offset=120, + negative_utc=False, + ) ) == { "timestamp": "2009-02-13T23:31:30+00:00", "offset": 120, "neg_utc_offset": False, } assert date_to_db( - {"timestamp": 1123456789, "offset": 0, "negative_utc": True,} + TimestampWithTimezone( + timestamp=Timestamp(seconds=1123456789, microseconds=0,), + offset=0, + negative_utc=True, + ) ) == { "timestamp": "2005-08-07T23:19:49+00:00", "offset": 0, "neg_utc_offset": True, } assert date_to_db( - {"timestamp": 1234567890, "offset": 42, "negative_utc": False,} + TimestampWithTimezone( + 
timestamp=Timestamp(seconds=1234567890, microseconds=0,), + offset=42, + negative_utc=False, + ) ) == { "timestamp": "2009-02-13T23:31:30+00:00", "offset": 42, "neg_utc_offset": False, } assert date_to_db( - {"timestamp": 1634366813, "offset": -120, "negative_utc": False,} + TimestampWithTimezone( + timestamp=Timestamp(seconds=1634366813, microseconds=0,), + offset=-120, + negative_utc=False, + ) ) == { "timestamp": "2021-10-16T06:46:53+00:00", "offset": -120, "neg_utc_offset": False, } def test_db_to_author(): # when actual_author = converters.db_to_author(b"fullname", b"name", b"email") # then - assert actual_author == { - "fullname": b"fullname", - "name": b"name", - "email": b"email", - } + assert actual_author == Person(fullname=b"fullname", name=b"name", email=b"email",) def test_db_to_author_none(): # when actual_author = converters.db_to_author(None, None, None) # then assert actual_author is None def test_db_to_revision(): # when actual_revision = converters.db_to_revision( { "id": b"revision-id", "date": None, "date_offset": None, "date_neg_utc_offset": None, "committer_date": None, "committer_date_offset": None, "committer_date_neg_utc_offset": None, "type": "git", "directory": b"dir-sha1", "message": b"commit message", "author_fullname": b"auth-fullname", "author_name": b"auth-name", "author_email": b"auth-email", "committer_fullname": b"comm-fullname", "committer_name": b"comm-name", "committer_email": b"comm-email", "metadata": {}, "synthetic": False, "extra_headers": (), "parents": [b"123", b"456"], } ) # then - assert actual_revision == { - "id": b"revision-id", - "author": { - "fullname": b"auth-fullname", - "name": b"auth-name", - "email": b"auth-email", - }, - "date": None, - "committer": { - "fullname": b"comm-fullname", - "name": b"comm-name", - "email": b"comm-email", - }, - "committer_date": None, - "type": "git", - "directory": b"dir-sha1", - "message": b"commit message", - "metadata": {}, - "synthetic": False, - "extra_headers": (), - 
"parents": [b"123", b"456"], - } + assert actual_revision == Revision( + id=b"revision-id", + author=Person( + fullname=b"auth-fullname", name=b"auth-name", email=b"auth-email", + ), + date=None, + committer=Person( + fullname=b"comm-fullname", name=b"comm-name", email=b"comm-email", + ), + committer_date=None, + type=RevisionType.GIT, + directory=b"dir-sha1", + message=b"commit message", + metadata={}, + synthetic=False, + extra_headers=(), + parents=(b"123", b"456"), + ) def test_db_to_release(): # when actual_release = converters.db_to_release( { "id": b"release-id", "target": b"revision-id", "target_type": "revision", "date": None, "date_offset": None, "date_neg_utc_offset": None, "name": b"release-name", "comment": b"release comment", "synthetic": True, "author_fullname": b"auth-fullname", "author_name": b"auth-name", "author_email": b"auth-email", } ) # then - assert actual_release == { - "author": { - "fullname": b"auth-fullname", - "name": b"auth-name", - "email": b"auth-email", - }, - "date": None, - "id": b"release-id", - "name": b"release-name", - "message": b"release comment", - "synthetic": True, - "target": b"revision-id", - "target_type": "revision", - } + assert actual_release == Release( + author=Person( + fullname=b"auth-fullname", name=b"auth-name", email=b"auth-email", + ), + date=None, + id=b"release-id", + name=b"release-name", + message=b"release comment", + synthetic=True, + target=b"revision-id", + target_type=ObjectType.REVISION, + )