diff --git a/swh/provenance/mongo/backend.py b/swh/provenance/mongo/backend.py --- a/swh/provenance/mongo/backend.py +++ b/swh/provenance/mongo/backend.py @@ -140,11 +140,14 @@ yield from sorted(occurs, key=lambda x: (x.date, x.revision, x.origin, x.path)) def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: + # FIXME, add index in content sha1 and ts + # FIXME, do the timezone operation in mongo return { x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc) + # FIXME try to avoid this loop and return directly in the needed format from mongo for x in self.db.content.find( - {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}}, - {"sha1": 1, "ts": 1, "_id": 0}, + {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}}, + {"sha1": 1, "ts": 1, "_id": 0}, ) } @@ -173,6 +176,8 @@ def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: return { x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc) + # FIXME try to avoid this loop and return directly in the needed format from mongo + # FIXME add ts to index in directory for x in self.db.directory.find( {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}}, {"sha1": 1, "ts": 1, "_id": 0}, @@ -180,12 +185,17 @@ } def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]: - return { - x["sha1"] - for x in self.db.get_collection(entity.value).find( - {}, {"sha1": 1, "_id": 0} - ) - } + # only for tests + + return set(self.db.get_collection(entity.value).distinct('sha1')) + + # return { + # x["sha1"] + # # FIXME try to avoid this loop and return directly in the needed format from mongo + # for x in self.db.get_collection(entity.value).find( + # {}, {"sha1": 1, "_id": 0} + # ) + # } def location_add(self, paths: Iterable[bytes]) -> bool: # TODO: implement this methods if path are to be stored in a separate collection @@ -194,6 +204,7 @@ def location_get_all(self) -> Set[bytes]: contents = self.db.content.find({}, {"revision": 1, "_id": 0, "directory": 1}) paths: List[Iterable[bytes]] = [] +
for content in contents: paths.extend(value for _, value in content["revision"].items()) paths.extend(value for _, value in content["directory"].items())
@@ -216,9 +227,22 @@
                 self.db.origin.insert_one({"sha1": sha1, "url": url})
         return True
 
+        # origins = {
+        #     x["sha1"]: x
+        #     for x in self.db.origin.find(
+        #         {"sha1": {"$in": list(urls)}}, {"sha1": 1, "url": 1, "_id": 1}
+        #     )
+        # }
+        # for sha1, url in urls.items():
+        #     if sha1 not in origins:
+        #         # add new origin
+        #         self.db.origin.insert_one({"sha1": sha1, "url": url})
+        # return True
+
     def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]:
         return {
             x["sha1"]: x["url"]
+            # FIXME try to avoid this loop and return directly in the needed format from mongo
             for x in self.db.origin.find(
                 {"sha1": {"$in": list(ids)}}, {"sha1": 1, "url": 1, "_id": 0}
             )
@@ -283,6 +307,7 @@
     def relation_add(
         self, relation: RelationType, data: Dict[Sha1Git, Set[RelationData]]
     ) -> bool:
+
         src_relation, *_, dst_relation = relation.value.split("_")
 
         dst_objs = {
diff --git a/swh/provenance/mongo/bootstrap.py b/swh/provenance/mongo/bootstrap.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/bootstrap.py
@@ -0,0 +1,9 @@
+# FIXME, maybe this should not be part of the code
+
+collections = []
+
+indexes = []
+
+
+def bootstrap_db():
+    """Placeholder for database bootstrapping; currently does nothing."""
diff --git a/swh/provenance/mongo/errors.py b/swh/provenance/mongo/errors.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/errors.py
@@ -0,0 +1,16 @@
+class DBError(Exception):
+    """Raised when a database operation fails."""
+
+    # FIXME, add mongo specific logging
+
+
+class EntityError(Exception):
+    """Raised when an unknown entity type is requested."""
+
+
+class DataError(Exception):
+    """Raised when entity data fails model validation."""
+
+
+# Backward-compatible alias for the original, misspelled name.
+EntiryError = EntityError
diff --git a/swh/provenance/mongo/models/cache.py b/swh/provenance/mongo/models/cache.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/models/cache.py
@@ -0,0 +1,21 @@
+class Cache:
+    """In-memory object cache; objects are keyed by their ``sha1``."""
+
+    def __init__(self, data):
+        """Wrap ``data``, the backing mapping, which is used as-is (not copied)."""
+        self.data = data
+
+    def _clear(self):
+        """Drop every cached object."""
+        self.data = {}
+
+    def set_obj(self, obj):
+        """Cache ``obj`` under ``obj.sha1``, replacing any previous entry."""
+        self.data[obj.sha1] = obj
+
+    def add_data(self, data):
+        """Placeholder for bulk insertion; not implemented yet."""
+
+    def get(self, key):
+        """Return the object cached under ``key``, or ``None`` if absent."""
+        return self.data.get(key)
diff --git a/swh/provenance/mongo/models/content.py b/swh/provenance/mongo/models/content.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/models/content.py
@@ -0,0 +1,15 @@
+from .entity import Entity
+
+
+class Content(Entity):
+    """Entity stored in the ``content`` collection."""
+
+    collection = "content"
+    model = {}
+    validate_model = False
+
+    def find_first(self):
+        """Placeholder; not implemented yet."""
+
+    def find_all(self):
+        """Placeholder; not implemented yet."""
diff --git a/swh/provenance/mongo/models/directory.py b/swh/provenance/mongo/models/directory.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/models/directory.py
@@ -0,0 +1,9 @@
+from .entity import Entity
+
+
+class Directory(Entity):
+    """Entity stored in the ``directory`` collection."""
+
+    collection = "directory"
+    model = {}
+    validate_model = False
diff --git a/swh/provenance/mongo/models/entity.py b/swh/provenance/mongo/models/entity.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/models/entity.py
@@ -0,0 +1,91 @@
+from abc import ABCMeta
+
+from ..errors import DataError, EntityError
+
+
+class Entity(metaclass=ABCMeta):
+    """An object saved in the db."""
+
+    collection: str
+    model: dict
+    validate_model: bool
+
+    @staticmethod
+    def factory(entity: str):
+        """Return the ``Entity`` subclass registered under ``entity``.
+
+        Raises ``EntityError`` when ``entity`` names no known entity type.
+        """
+        # Imported here because the subclass modules import this module.
+        from .content import Content
+        from .directory import Directory
+
+        mapping = {
+            "content": Content,
+            "directory": Directory,
+        }
+        if entity in mapping:
+            return mapping[entity]
+        raise EntityError(f"invalid entity type {entity}")
+
+    def __init__(self, data):
+        self.model = self._load_model()
+        self.data = self._set_data(data)
+
+    def _load_model(self):
+        """Return the model description; currently always empty."""
+        return {}
+
+    def _set_data(self, data):
+        """Return the value stored as ``self.data``: ``data``, unchanged."""
+        return data
+
+    def _validate(self):
+        """Return True when ``self.data`` is valid.
+
+        TODO: validate against the JSON schema in schema.json; until then
+        nothing is checked and every entity is accepted.
+        """
+        return True
+
+    def save(self):
+        """Validate (when ``validate_model`` is set) and persist this entity.
+
+        Raises ``DataError`` when validation fails.
+        """
+        if self.validate_model and not self._validate():
+            raise DataError()
+        self._persist()
+
+    def _persist(self):
+        """Write this entity to the db; not implemented yet."""
+        raise NotImplementedError
+
+    def get(self, qry):
+        """Placeholder; not implemented yet."""
+
+    def _is_older_in_time(self, ts):
+        """Return True when this entity's ``ts`` is earlier than ``ts``."""
+        return self.data.ts < ts
+
+    def add_if_older(self, ts=None):
+        """Save this entity if it is older than the stored timestamp ``ts``.
+
+        ``ts`` is the timestamp to compare against; ``None`` (the default)
+        means none is known, in which case the entity is always saved.
+        """
+        if ts is None or self._is_older_in_time(ts):
+            self.save()
+
+    def with_exception_handle(self):
+        """Placeholder; not implemented yet."""
+
+    # Backward-compatible alias for the original, misspelled name.
+    with_excetion_handle = with_exception_handle
+
+
+class EntityList:
+    """
+    List or array of entities and their operations
+    Operate mostly on object cache
+    """
diff --git
a/swh/provenance/mongo/models/origin.py b/swh/provenance/mongo/models/origin.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/models/origin.py
@@ -0,0 +1,9 @@
+from .entity import Entity
+
+
+class Origin(Entity):
+    """Entity stored in the ``origin`` collection."""
+
+    collection = "origin"
+    model = {}
+    validate_model = False
diff --git a/swh/provenance/mongo/models/revision.py b/swh/provenance/mongo/models/revision.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/models/revision.py
@@ -0,0 +1,9 @@
+from .entity import Entity
+
+
+class Revision(Entity):
+    """Entity stored in the ``revision`` collection."""
+
+    collection = "revision"
+    model = {}
+    validate_model = False
diff --git a/swh/provenance/mongo/schema.json b/swh/provenance/mongo/schema.json
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/schema.json
@@ -0,0 +1,32 @@
+{
+  "content": {
+    "bsonType": "object",
+    "required": ["sha1"],
+    "properties": {
+      "sha1": {
+        "bsonType": "binData",
+        "description": ""
+      }
+    }
+  },
+  "directory": {
+    "bsonType": "object",
+    "required": ["sha1"],
+    "properties": {
+      "sha1": {
+        "bsonType": "binData",
+        "description": ""
+      }
+    }
+  },
+  "revision": {
+    "bsonType": "object",
+    "required": ["sha1"],
+    "properties": {
+      "sha1": {
+        "bsonType": "binData",
+        "description": ""
+      }
+    }
+  }
+}
diff --git a/swh/provenance/mongo/storage.py b/swh/provenance/mongo/storage.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/storage.py
@@ -0,0 +1,136 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime, timezone
+import logging
+import os
+from typing import Any, Dict, Generator, Iterable, List, Optional, Set, Union
+
+from bson import ObjectId
+import pymongo.database
+
+from swh.model.model import Sha1Git
+
+from ..interface import (
+    EntityType,
+    ProvenanceResult,
+    RelationData,
+    RelationType,
+    RevisionData,
+)
+from .errors import DBError
+from .models.entity import Entity
+
+logger = logging.getLogger(__name__)
+
+
+class ProvenanceStorageMongoDb:
+    """Provenance storage on MongoDB, implemented through the ``Entity`` models.
+
+    Work in progress: most methods are placeholders, and the
+    ``get_all_in_list`` / ``dump_list_as_date_dict`` helpers called below
+    are not defined on ``Entity`` yet.
+    """
+
+    def __init__(self, db: pymongo.database.Database):
+        self.db = db
+
+    def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
+        pass
+
+    def content_find_all(
+        self, id: Sha1Git, limit: Optional[int] = None
+    ) -> Generator[ProvenanceResult, None, None]:
+        pass
+
+    def content_add(
+        self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, datetime]]
+    ) -> bool:
+        """Add each content in ``cnts``; items that raise DBError are skipped."""
+        for each_cnt in cnts:
+            try:
+                Entity.factory("content")(each_cnt).add_if_older()
+            except DBError:
+                # FIXME, raise if needed; for now log and skip this item
+                logger.exception("could not add content %r", each_cnt)
+        return True
+
+    def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
+        """Return the date of each content in ``ids``; ``{}`` on DBError."""
+        content = Entity.factory("content")
+        try:
+            contents = content.get_all_in_list({"sha1": ids})
+        except DBError:
+            # FIXME, raise if needed; for now log and report nothing
+            logger.exception("could not fetch contents")
+            return {}
+        return content.dump_list_as_date_dict(contents)
+
+    def directory_add(
+        self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, datetime]]
+    ) -> bool:
+        """Add each directory in ``dirs``; items that raise DBError are skipped."""
+        for each_dir in dirs:
+            try:
+                Entity.factory("directory")(each_dir).add_if_older()
+            except DBError:
+                # FIXME, raise if needed; for now log and skip this item
+                logger.exception("could not add directory %r", each_dir)
+        return True
+
+    def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
+        """Return the date of each directory in ``ids``; ``{}`` on DBError."""
+        directory = Entity.factory("directory")
+        try:
+            directories = directory.get_all_in_list({"sha1": ids})
+        except DBError:
+            # FIXME, raise if needed; for now log and report nothing
+            logger.exception("could not fetch directories")
+            return {}
+        return directory.dump_list_as_date_dict(directories)
+
+    def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
+        # The factory is keyed by collection name, i.e. the enum's value.
+        # TODO: confirm that an empty filter selects every document.
+        return Entity.factory(entity.value).get_all_in_list({})
+
+    def location_add(self, paths: Iterable[bytes]) -> bool:
+        # TODO: implement this methods if path are to be stored in a separate collection
+        return True
+
+    def location_get_all(self) -> Set[bytes]:
+        pass
+
+    def origin_add(self, orgs: Dict[Sha1Git, str]) -> bool:
+        return True
+
+    def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]:
+        pass
+
+    def revision_add(
+        self, revs: Union[Iterable[Sha1Git], Dict[Sha1Git, RevisionData]]
+    ) -> bool:
+        return True
+
+    def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]:
+        pass
+
+    def relation_add(
+        self, relation: RelationType, data: Dict[Sha1Git, Set[RelationData]]
+    ) -> bool:
+        return True
+
+    def relation_get(
+        self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False
+    ) -> Dict[Sha1Git, Set[RelationData]]:
+        pass
+
+    def relation_get_all(
+        self, relation: RelationType
+    ) -> Dict[Sha1Git, Set[RelationData]]:
+        pass
+
+    def with_path(self) -> bool:
+        return True
diff --git a/swh/provenance/mongo/tests/__init__.py b/swh/provenance/mongo/tests/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/tests/__init__.py
@@ -0,0 +1 @@
+pass