Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/graph.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from __future__ import annotations | from __future__ import annotations | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
import logging | |||||
import os | import os | ||||
from typing import Any, Dict, Optional, Set | from typing import Any, Dict, Optional, Set | ||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | ||||
from swh.model.model import Sha1Git | from swh.model.model import Sha1Git | ||||
from .archive import ArchiveInterface | from .archive import ArchiveInterface | ||||
from .interface import ProvenanceInterface | from .interface import ProvenanceInterface | ||||
▲ Show 20 Lines • Show All 164 Lines • ▼ Show 20 Lines | ) -> IsochroneNode: | ||||
# that are not already known. | # that are not already known. | ||||
# | # | ||||
# 2. compute the maxdate for each node of the tree that was not found in the DB. | # 2. compute the maxdate for each node of the tree that was not found in the DB. | ||||
# Build the nodes structure | # Build the nodes structure | ||||
root_date = provenance.directory_get_date_in_isochrone_frontier(directory) | root_date = provenance.directory_get_date_in_isochrone_frontier(directory) | ||||
root = IsochroneNode(directory, dbdate=root_date) | root = IsochroneNode(directory, dbdate=root_date) | ||||
stack = [root] | stack = [root] | ||||
logging.debug( | |||||
f"Recursively creating isochrone graph for revision {revision.id.hex()}..." | |||||
) | |||||
fdates: Dict[Sha1Git, datetime] = {} # map {file_id: date} | fdates: Dict[Sha1Git, datetime] = {} # map {file_id: date} | ||||
while stack: | while stack: | ||||
current = stack.pop() | current = stack.pop() | ||||
if current.dbdate is None or current.dbdate > revision.date: | if current.dbdate is None or current.dbdate > revision.date: | ||||
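# If current directory has an associated date in the isochrone frontier that | # If current directory has an associated date in the isochrone frontier that | ||||
# is greater or equal to the current revision's one, it should be ignored as | # is greater or equal to the current revision's one, it should be ignored as | ||||
# the revision is being processed out of order. | # the revision is being processed out of order. | ||||
# NOTE(review): the checks around this comment use a strict `>`, not `>=`; | # NOTE(review): the checks around this comment use a strict `>`, not `>=`; | ||||
# confirm whether an equal date should also invalidate the node. | # confirm whether an equal date should also invalidate the node. | ||||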
if current.dbdate is not None and current.dbdate > revision.date: | if current.dbdate is not None and current.dbdate > revision.date: | ||||
logging.debug( | |||||
f"Invalidating frontier on {current.entry.id.hex()}" | |||||
f" (date {current.dbdate})" | |||||
f" when processing revision {revision.id.hex()}" | |||||
f" (date {revision.date})" | |||||
) | |||||
current.invalidate() | current.invalidate() | ||||
# Pre-query all known dates for directories in the current directory | # Pre-query all known dates for directories in the current directory | ||||
# for the provenance object to have them cached and (potentially) improve | # for the provenance object to have them cached and (potentially) improve | ||||
# performance. | # performance. | ||||
current.entry.retrieve_children(archive) | current.entry.retrieve_children(archive) | ||||
ddates = provenance.directory_get_dates_in_isochrone_frontier( | ddates = provenance.directory_get_dates_in_isochrone_frontier( | ||||
current.entry.dirs | current.entry.dirs | ||||
) | ) | ||||
for dir in current.entry.dirs: | for dir in current.entry.dirs: | ||||
# Recursively analyse subdirectory nodes | # Recursively analyse subdirectory nodes | ||||
node = current.add_directory(dir, date=ddates.get(dir.id, None)) | node = current.add_directory(dir, date=ddates.get(dir.id, None)) | ||||
stack.append(node) | stack.append(node) | ||||
fdates.update(provenance.content_get_early_dates(current.entry.files)) | fdates.update(provenance.content_get_early_dates(current.entry.files)) | ||||
logging.debug( | |||||
f"Isochrone graph for revision {revision.id.hex()} successfully created!" | |||||
) | |||||
# Precalculate max known date for each node in the graph (only directory nodes are | # Precalculate max known date for each node in the graph (only directory nodes are | ||||
# pushed to the stack). | # pushed to the stack). | ||||
logging.debug(f"Computing maxdates for revision {revision.id.hex()}...") | |||||
stack = [root] | stack = [root] | ||||
while stack: | while stack: | ||||
current = stack.pop() | current = stack.pop() | ||||
# Current directory node is known if it already has an assigned date (i.e. it was | # Current directory node is known if it already has an assigned date (i.e. it was | ||||
# already seen as an isochrone frontier). | # already seen as an isochrone frontier). | ||||
if current.known: | if current.known: | ||||
assert current.maxdate is None | assert current.maxdate is None | ||||
Show All 34 Lines | while stack: | ||||
# possible | # possible | ||||
and all((file.id in fdates) for file in current.entry.files) | and all((file.id in fdates) for file in current.entry.files) | ||||
) | ) | ||||
else: | else: | ||||
# at least one content is being processed out-of-order, so the current | # at least one content is being processed out-of-order, so the current | ||||
# node should be treated as unknown | # node should be treated as unknown | ||||
current.maxdate = revision.date | current.maxdate = revision.date | ||||
current.known = False | current.known = False | ||||
logging.debug(f"Maxdates for revision {revision.id.hex()} successfully computed!") | |||||
return root | return root |