Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/mongo/backend.py
Show First 20 Lines • Show All 134 Lines • ▼ Show 20 Lines | ) -> Generator[ProvenanceResult, None, None]: | ||||
), | ), | ||||
origin=origin["url"], | origin=origin["url"], | ||||
path=path, | path=path, | ||||
) | ) | ||||
) | ) | ||||
yield from sorted(occurs, key=lambda x: (x.date, x.revision, x.origin, x.path)) | yield from sorted(occurs, key=lambda x: (x.date, x.revision, x.origin, x.path)) | ||||
def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
    """Return the known date of each requested content blob.

    Args:
        ids: sha1 identifiers of the content blobs to look up.

    Returns:
        A mapping from sha1 to a timezone-aware (UTC) datetime built from
        the stored ``ts`` value. Identifiers that are not found, or whose
        ``ts`` is ``None``, are absent from the result.
    """
    # FIXME: add an index on content sha1 and ts
    query = {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}}
    # Only the two fields used below are fetched.
    projection = {"sha1": 1, "ts": 1, "_id": 0}

    dates: Dict[Sha1Git, datetime] = {}
    # FIXME: try to avoid this loop and return directly in the needed
    # format from mongo
    for doc in self.db.content.find(query, projection):
        # FIXME: do the timezone operation in mongo
        dates[doc["sha1"]] = datetime.fromtimestamp(doc["ts"], timezone.utc)
    return dates
def directory_add( | def directory_add( | ||||
self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, datetime]] | self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, datetime]] | ||||
) -> bool: | ) -> bool: | ||||
data = dirs if isinstance(dirs, dict) else dict.fromkeys(dirs) | data = dirs if isinstance(dirs, dict) else dict.fromkeys(dirs) | ||||
existing = { | existing = { | ||||
Show All 12 Lines | ) -> bool: | ||||
) | ) | ||||
else: | else: | ||||
self.db.directory.insert_one({"sha1": sha1, "ts": ts, "revision": {}}) | self.db.directory.insert_one({"sha1": sha1, "ts": ts, "revision": {}}) | ||||
return True | return True | ||||
def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
    """Return the known date of each requested directory.

    Args:
        ids: sha1 identifiers of the directories to look up.

    Returns:
        A mapping from sha1 to a timezone-aware (UTC) datetime built from
        the stored ``ts`` value. Identifiers that are not found, or whose
        ``ts`` is ``None``, are absent from the result.
    """
    # FIXME: add ts to the index in directory
    query = {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}}
    # Only the two fields used below are fetched.
    projection = {"sha1": 1, "ts": 1, "_id": 0}

    dates: Dict[Sha1Git, datetime] = {}
    # FIXME: try to avoid this loop and return directly in the needed
    # format from mongo
    for doc in self.db.directory.find(query, projection):
        dates[doc["sha1"]] = datetime.fromtimestamp(doc["ts"], timezone.utc)
    return dates
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
    """Return the sha1 of every document stored for the given entity kind.

    Only intended for tests.

    Args:
        entity: entity kind; its ``value`` is used as the name of the
            collection to read.

    Returns:
        The set of distinct ``sha1`` values found in that collection.
    """
    # De-duplication is done by the database rather than by iterating over
    # every document on the client side.
    # NOTE(review): the whole result comes back in a single reply, so this
    # presumably does not scale to very large collections -- acceptable for
    # tests, confirm before using it elsewhere.
    return set(self.db.get_collection(entity.value).distinct("sha1"))
def location_add(self, paths: Iterable[bytes]) -> bool:
    """Accept a batch of paths without storing anything.

    Args:
        paths: paths to register; currently ignored.

    Returns:
        Always ``True``.
    """
    # TODO: implement this method if paths are to be stored in a separate
    # collection
    return True
def location_get_all(self) -> Set[bytes]:
    """Return every path referenced by content and directory documents.

    Paths are gathered from the values of the ``revision`` and ``directory``
    sub-documents of each content document, and from the values of the
    ``revision`` sub-document of each directory document.

    Returns:
        The set of all distinct paths found.

    Raises:
        KeyError: if a document lacks one of the sub-documents read here.
    """
    # Accumulate straight into a set: the previous ``sum(lists, [])``
    # flattening re-copied the growing list on every addition (quadratic)
    # and only worked when every value was a ``list``.
    # NOTE(review): each sub-document value is assumed to be an iterable of
    # paths (e.g. a list of bytes) -- confirm against the writers.
    paths: Set[bytes] = set()

    contents = self.db.content.find({}, {"revision": 1, "_id": 0, "directory": 1})
    for content in contents:
        for locations in content["revision"].values():
            paths.update(locations)
        for locations in content["directory"].values():
            paths.update(locations)

    dirs = self.db.directory.find({}, {"revision": 1, "_id": 0})
    for each_dir in dirs:
        for locations in each_dir["revision"].values():
            paths.update(locations)

    return paths
def origin_add(self, orgs: Dict[Sha1Git, str]) -> bool:
    """Insert the origins that are not already stored.

    Args:
        orgs: mapping from origin sha1 to origin URL.

    Returns:
        Always ``True``.
    """
    # Only membership is needed, so fetch just the sha1 of the origins that
    # already exist instead of whole documents.
    known = {
        doc["sha1"]
        for doc in self.db.origin.find(
            {"sha1": {"$in": list(orgs)}}, {"sha1": 1, "_id": 0}
        )
    }
    for sha1, url in orgs.items():
        if sha1 not in known:
            # add new origin; existing ones are left untouched
            self.db.origin.insert_one({"sha1": sha1, "url": url})
    return True
def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]:
    """Return the URL of each requested origin.

    Args:
        ids: sha1 identifiers of the origins to look up.

    Returns:
        A mapping from sha1 to URL; identifiers that are not found are
        absent from the result.
    """
    query = {"sha1": {"$in": list(ids)}}
    projection = {"sha1": 1, "url": 1, "_id": 0}

    urls: Dict[Sha1Git, str] = {}
    # FIXME: try to avoid this loop and return directly in the needed
    # format from mongo
    for doc in self.db.origin.find(query, projection):
        urls[doc["sha1"]] = doc["url"]
    return urls
def revision_add( | def revision_add( | ||||
self, revs: Union[Iterable[Sha1Git], Dict[Sha1Git, RevisionData]] | self, revs: Union[Iterable[Sha1Git], Dict[Sha1Git, RevisionData]] | ||||
) -> bool: | ) -> bool: | ||||
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]: | ||||
}, | }, | ||||
{"sha1": 1, "preferred": 1, "ts": 1, "_id": 0}, | {"sha1": 1, "preferred": 1, "ts": 1, "_id": 0}, | ||||
) | ) | ||||
} | } | ||||
def relation_add( | def relation_add( | ||||
self, relation: RelationType, data: Dict[Sha1Git, Set[RelationData]] | self, relation: RelationType, data: Dict[Sha1Git, Set[RelationData]] | ||||
) -> bool: | ) -> bool: | ||||
src_relation, *_, dst_relation = relation.value.split("_") | src_relation, *_, dst_relation = relation.value.split("_") | ||||
dst_objs = { | dst_objs = { | ||||
x["sha1"]: x["_id"] | x["sha1"]: x["_id"] | ||||
for x in self.db.get_collection(dst_relation).find( | for x in self.db.get_collection(dst_relation).find( | ||||
{ | { | ||||
"sha1": { | "sha1": { | ||||
"$in": list({rel.dst for rels in data.values() for rel in rels}) | "$in": list({rel.dst for rels in data.values() for rel in rels}) | ||||
▲ Show 20 Lines • Show All 167 Lines • Show Last 20 Lines |