Page MenuHomeSoftware Heritage

D5886.id21177.diff
No OneTemporary

D5886.id21177.diff

diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py
--- a/swh/provenance/graph.py
+++ b/swh/provenance/graph.py
@@ -1,10 +1,7 @@
-from collections import Counter
from datetime import datetime, timezone
import logging
import os
-from typing import Dict, List, Optional
-
-from swh.model.hashutil import hash_to_hex
+from typing import Dict, Optional, Set
from .archive import ArchiveInterface
from .model import DirectoryEntry, RevisionEntry
@@ -13,6 +10,73 @@
UTCMIN = datetime.min.replace(tzinfo=timezone.utc)
+class HistoryNode:
+ def __init__(
+ self, entry: RevisionEntry, visited: bool = False, in_history: bool = False
+ ):
+ self.entry = entry
+ # A revision is `visited` if it is directly pointed by an origin (ie. a head
+ # revision for some snapshot)
+ self.visited = visited
+ # A revision is `in_history` if it appears in the history graph of an already
+ # processed revision in the provenance database
+ self.in_history = in_history
+ self.parents: Set[HistoryNode] = set()
+
+ def add_parent(
+ self, parent: RevisionEntry, visited: bool = False, in_history: bool = False
+ ) -> "HistoryNode":
+ node = HistoryNode(parent, visited=visited, in_history=in_history)
+ self.parents.add(node)
+ return node
+
+ def __str__(self):
+ return (
+ f"<{self.entry}: visited={self.visited}, in_history={self.in_history}, "
+ f"parents=[{', '.join(str(parent) for parent in self.parents)}]>"
+ )
+
+ def __eq__(self, other):
+ return isinstance(other, HistoryNode) and self.__dict__ == other.__dict__
+
+ def __hash__(self):
+ return hash((self.entry, self.visited, self.in_history))
+
+
+def build_history_graph(
+ archive: ArchiveInterface,
+ provenance: ProvenanceInterface,
+ revision: RevisionEntry,
+) -> HistoryNode:
+ """Recursively build the history graph from the given revision"""
+
+ root = HistoryNode(
+ revision,
+ visited=provenance.revision_visited(revision),
+ in_history=provenance.revision_in_history(revision),
+ )
+ stack = [root]
+ logging.debug(
+ f"Recursively creating history graph for revision {revision.id.hex()}..."
+ )
+ while stack:
+ current = stack.pop()
+ if not current.visited:
+ current.entry.retrieve_parents(archive)
+
+ for rev in current.entry.parents:
+ node = current.add_parent(
+ rev,
+ visited=provenance.revision_visited(rev),
+ in_history=provenance.revision_in_history(rev),
+ )
+ stack.append(node)
+ logging.debug(
+ f"History graph for revision {revision.id.hex()} successfully created!"
+ )
+ return root
+
+
class IsochroneNode:
def __init__(
self,
@@ -36,7 +100,7 @@
self.known = self.dbdate is not None
self.invalid = False
self.path = os.path.join(prefix, self.entry.name) if prefix else self.entry.name
- self.children: List[IsochroneNode] = []
+ self.children: Set[IsochroneNode] = set()
@property
def dbdate(self):
@@ -52,56 +116,27 @@
def add_directory(
self, child: DirectoryEntry, date: Optional[datetime] = None
) -> "IsochroneNode":
- # we should not be processing this node (ie add subdirectories or
- # files) if it's actually known by the provenance DB
+ # we should not be processing this node (ie add subdirectories or files) if it's
+ # actually known by the provenance DB
assert self.dbdate is None
node = IsochroneNode(child, dbdate=date, depth=self.depth + 1, prefix=self.path)
- self.children.append(node)
+ self.children.add(node)
return node
def __str__(self):
return (
- f"<{self.entry}: dbdate={self.dbdate}, maxdate={self.maxdate}, "
+ f"<{self.entry}: depth={self.depth}, "
+ f"dbdate={self.dbdate}, maxdate={self.maxdate}, "
f"known={self.known}, invalid={self.invalid}, path={self.path}, "
f"children=[{', '.join(str(child) for child in self.children)}]>"
)
def __eq__(self, other):
- return (
- isinstance(other, IsochroneNode)
- and (
- self.entry,
- self.depth,
- self._dbdate,
- self.maxdate,
- self.known,
- self.invalid,
- self.path,
- )
- == (
- other.entry,
- other.depth,
- other._dbdate,
- other.maxdate,
- other.known,
- other.invalid,
- other.path,
- )
- and Counter(self.children) == Counter(other.children)
- )
+ return isinstance(other, IsochroneNode) and self.__dict__ == other.__dict__
def __hash__(self):
- return hash(
- (
- self.entry,
- self.depth,
- self._dbdate,
- self.maxdate,
- self.known,
- self.invalid,
- self.path,
- )
- )
+ # only immutable attributes are considered to compute hash
+ return hash((self.entry, self.depth, self.path))
def build_isochrone_graph(
@@ -129,7 +164,7 @@
root = IsochroneNode(directory, dbdate=root_date)
stack = [root]
logging.debug(
- f"Recursively creating graph for revision {hash_to_hex(revision.id)}..."
+ f"Recursively creating isochrone graph for revision {revision.id.hex()}..."
)
fdates: Dict[bytes, datetime] = {} # map {file_id: date}
while stack:
@@ -140,9 +175,9 @@
# the revision is being processed out of order.
if current.dbdate is not None and current.dbdate > revision.date:
logging.debug(
- f"Invalidating frontier on {hash_to_hex(current.entry.id)}"
+ f"Invalidating frontier on {current.entry.id.hex()}"
f" (date {current.dbdate})"
- f" when processing revision {hash_to_hex(revision.id)}"
+ f" when processing revision {revision.id.hex()}"
f" (date {revision.date})"
)
current.invalidate()
@@ -162,11 +197,11 @@
fdates.update(provenance.content_get_early_dates(current.entry.files))
logging.debug(
- f"Isochrone graph for revision {hash_to_hex(revision.id)} successfully created!"
+ f"Isochrone graph for revision {revision.id.hex()} successfully created!"
)
# Precalculate max known date for each node in the graph (only directory nodes are
# pushed to the stack).
- logging.debug(f"Computing maxdates for revision {hash_to_hex(revision.id)}...")
+ logging.debug(f"Computing maxdates for revision {revision.id.hex()}...")
stack = [root]
while stack:
@@ -217,7 +252,5 @@
# node should be treated as unknown
current.maxdate = revision.date
current.known = False
- logging.debug(
- f"Maxdates for revision {hash_to_hex(revision.id)} successfully computed!"
- )
+ logging.debug(f"Maxdates for revision {revision.id.hex()} successfully computed!")
return root
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -6,9 +6,8 @@
import iso8601
-from swh.model.hashutil import hash_to_hex
-
from .archive import ArchiveInterface
+from .graph import HistoryNode, build_history_graph
from .model import OriginEntry, RevisionEntry
from .provenance import ProvenanceInterface
@@ -48,76 +47,60 @@
provenance: ProvenanceInterface,
archive: ArchiveInterface,
origins: List[OriginEntry],
-) -> None:
+):
start = time.time()
for origin in origins:
origin.retrieve_revisions(archive)
for revision in origin.revisions:
- origin_add_revision(provenance, archive, origin, revision)
+ graph = build_history_graph(archive, provenance, revision)
+ origin_add_revision(provenance, origin, graph)
done = time.time()
provenance.commit()
stop = time.time()
logging.debug(
"Origins "
- ";".join(
- [origin.url + ":" + hash_to_hex(origin.snapshot) for origin in origins]
- )
+ ";".join([origin.url + ":" + origin.snapshot.hex() for origin in origins])
+ f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
)
def origin_add_revision(
provenance: ProvenanceInterface,
- archive: ArchiveInterface,
origin: OriginEntry,
- revision: RevisionEntry,
-) -> None:
- stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)]
+ graph: HistoryNode,
+):
origin.id = provenance.origin_get_id(origin)
- while stack:
- relative, current = stack.pop()
-
- # Check if current revision has no preferred origin and update if necessary.
- preferred = provenance.revision_get_preferred_origin(current)
-
- if preferred is None:
- provenance.revision_set_preferred_origin(origin, current)
- ########################################################################
-
- if relative is None:
- # This revision is pointed directly by the origin.
- visited = provenance.revision_visited(current)
- provenance.revision_add_to_origin(origin, current)
+ # head is treated separately since it should always be added to the given origin
+ head = graph.entry
+ check_preferred_origin(provenance, origin, head)
+ provenance.revision_add_to_origin(origin, head)
- if not visited:
- stack.append((current, current))
+ # head's history should be recursively iterated starting from its parents
+ stack = list(graph.parents)
+ while stack:
+ current = stack.pop()
+ check_preferred_origin(provenance, origin, current.entry)
+ if current.visited:
+ # if current revision was already visited just add it to the current origin
+ # and stop recursion (its history has already been flattened)
+ provenance.revision_add_to_origin(origin, current.entry)
else:
- # This revision is a parent of another one in the history of the
- # relative revision.
- current.retrieve_parents(archive)
+ # if current revision was not visited before create a link between it and
+ # the head, and recursively walk its history
+ provenance.revision_add_before_revision(head, current.entry)
for parent in current.parents:
- visited = provenance.revision_visited(parent)
-
- if not visited:
- # The parent revision has never been seen before pointing
- # directly to an origin.
- known = provenance.revision_in_history(parent)
-
- if known:
- # The parent revision is already known in some other
- # revision's history. We should point it directly to
- # the origin and (eventually) walk its history.
- stack.append((None, parent))
- else:
- # The parent revision was never seen before. We should
- # walk its history and associate it with the same
- # relative revision.
- provenance.revision_add_before_revision(relative, parent)
- stack.append((relative, parent))
- else:
- # The parent revision already points to an origin, so its
- # history was properly processed before. We just need to
- # make sure it points to the current origin as well.
- provenance.revision_add_to_origin(origin, parent)
+ stack.append(parent)
+
+
+def check_preferred_origin(
+ provenance: ProvenanceInterface,
+ origin: OriginEntry,
+ revision: RevisionEntry,
+):
+ # if the revision has no preferred origin just set the given origin as the
+ # preferred one. TODO: this should be improved in the future!
+ preferred = provenance.revision_get_preferred_origin(revision)
+ if preferred is None:
+ provenance.revision_set_preferred_origin(origin, revision)
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -7,7 +7,7 @@
import iso8601
-from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.hashutil import hash_to_bytes
from .archive import ArchiveInterface
from .graph import IsochroneNode, build_isochrone_graph
@@ -71,7 +71,7 @@
date = provenance.revision_get_early_date(revision)
if date is None or revision.date < date:
logging.debug(
- f"Processing revisions {hash_to_hex(revision.id)}"
+ f"Processing revisions {revision.id.hex()}"
f" (known date {date} / revision date {revision.date})..."
)
graph = build_isochrone_graph(
@@ -95,11 +95,11 @@
provenance.commit()
stop = time.time()
logging.debug(
- f"Revisions {';'.join([hash_to_hex(revision.id) for revision in revisions])} "
+ f"Revisions {';'.join([revision.id.hex() for revision in revisions])} "
f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
)
# logging.critical(
- # ";".join([hash_to_hex(revision.id) for revision in revisions])
+ # ";".join([revision.id.hex() for revision in revisions])
# + f",{stop - start},{stop - done}"
# )
diff --git a/swh/provenance/tests/test_isochrone_graph.py b/swh/provenance/tests/test_isochrone_graph.py
--- a/swh/provenance/tests/test_isochrone_graph.py
+++ b/swh/provenance/tests/test_isochrone_graph.py
@@ -40,9 +40,9 @@
node.known = d.get("known", False)
node.invalid = d.get("invalid", False)
node.path = bytes(d["path"], encoding="utf-8")
- node.children = [
+ node.children = set(
isochrone_graph_from_dict(child, depth=depth + 1) for child in children
- ]
+ )
return node

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 2:48 PM (5 d, 9 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3235340

Event Timeline