Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9344789
D5886.id21177.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
D5886.id21177.diff
View Options
diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py
--- a/swh/provenance/graph.py
+++ b/swh/provenance/graph.py
@@ -1,10 +1,7 @@
-from collections import Counter
from datetime import datetime, timezone
import logging
import os
-from typing import Dict, List, Optional
-
-from swh.model.hashutil import hash_to_hex
+from typing import Dict, Optional, Set
from .archive import ArchiveInterface
from .model import DirectoryEntry, RevisionEntry
@@ -13,6 +10,73 @@
UTCMIN = datetime.min.replace(tzinfo=timezone.utc)
+class HistoryNode:
+ def __init__(
+ self, entry: RevisionEntry, visited: bool = False, in_history: bool = False
+ ):
+ self.entry = entry
+ # A revision is `visited` if it is directly pointed by an origin (ie. a head
+ # revision for some snapshot)
+ self.visited = visited
+ # A revision is `in_history` if it appears in the history graph of an already
+ # processed revision in the provenance database
+ self.in_history = in_history
+ self.parents: Set[HistoryNode] = set()
+
+ def add_parent(
+ self, parent: RevisionEntry, visited: bool = False, in_history: bool = False
+ ) -> "HistoryNode":
+ node = HistoryNode(parent, visited=visited, in_history=in_history)
+ self.parents.add(node)
+ return node
+
+ def __str__(self):
+ return (
+ f"<{self.entry}: visited={self.visited}, in_history={self.in_history}, "
+ f"parents=[{', '.join(str(parent) for parent in self.parents)}]>"
+ )
+
+ def __eq__(self, other):
+ return isinstance(other, HistoryNode) and self.__dict__ == other.__dict__
+
+ def __hash__(self):
+ return hash((self.entry, self.visited, self.in_history))
+
+
+def build_history_graph(
+ archive: ArchiveInterface,
+ provenance: ProvenanceInterface,
+ revision: RevisionEntry,
+) -> HistoryNode:
+ """Recursively build the history graph from the given revision"""
+
+ root = HistoryNode(
+ revision,
+ visited=provenance.revision_visited(revision),
+ in_history=provenance.revision_in_history(revision),
+ )
+ stack = [root]
+ logging.debug(
+ f"Recursively creating history graph for revision {revision.id.hex()}..."
+ )
+ while stack:
+ current = stack.pop()
+ if not current.visited:
+ current.entry.retrieve_parents(archive)
+
+ for rev in current.entry.parents:
+ node = current.add_parent(
+ rev,
+ visited=provenance.revision_visited(rev),
+ in_history=provenance.revision_in_history(rev),
+ )
+ stack.append(node)
+ logging.debug(
+ f"History graph for revision {revision.id.hex()} successfully created!"
+ )
+ return root
+
+
class IsochroneNode:
def __init__(
self,
@@ -36,7 +100,7 @@
self.known = self.dbdate is not None
self.invalid = False
self.path = os.path.join(prefix, self.entry.name) if prefix else self.entry.name
- self.children: List[IsochroneNode] = []
+ self.children: Set[IsochroneNode] = set()
@property
def dbdate(self):
@@ -52,56 +116,27 @@
def add_directory(
self, child: DirectoryEntry, date: Optional[datetime] = None
) -> "IsochroneNode":
- # we should not be processing this node (ie add subdirectories or
- # files) if it's actually known by the provenance DB
+ # we should not be processing this node (ie add subdirectories or files) if it's
+ # actually known by the provenance DB
assert self.dbdate is None
node = IsochroneNode(child, dbdate=date, depth=self.depth + 1, prefix=self.path)
- self.children.append(node)
+ self.children.add(node)
return node
def __str__(self):
return (
- f"<{self.entry}: dbdate={self.dbdate}, maxdate={self.maxdate}, "
+ f"<{self.entry}: depth={self.depth}, "
+ f"dbdate={self.dbdate}, maxdate={self.maxdate}, "
f"known={self.known}, invalid={self.invalid}, path={self.path}, "
f"children=[{', '.join(str(child) for child in self.children)}]>"
)
def __eq__(self, other):
- return (
- isinstance(other, IsochroneNode)
- and (
- self.entry,
- self.depth,
- self._dbdate,
- self.maxdate,
- self.known,
- self.invalid,
- self.path,
- )
- == (
- other.entry,
- other.depth,
- other._dbdate,
- other.maxdate,
- other.known,
- other.invalid,
- other.path,
- )
- and Counter(self.children) == Counter(other.children)
- )
+ return isinstance(other, IsochroneNode) and self.__dict__ == other.__dict__
def __hash__(self):
- return hash(
- (
- self.entry,
- self.depth,
- self._dbdate,
- self.maxdate,
- self.known,
- self.invalid,
- self.path,
- )
- )
+ # only immutable attributes are considered to compute hash
+ return hash((self.entry, self.depth, self.path))
def build_isochrone_graph(
@@ -129,7 +164,7 @@
root = IsochroneNode(directory, dbdate=root_date)
stack = [root]
logging.debug(
- f"Recursively creating graph for revision {hash_to_hex(revision.id)}..."
+ f"Recursively creating isochrone graph for revision {revision.id.hex()}..."
)
fdates: Dict[bytes, datetime] = {} # map {file_id: date}
while stack:
@@ -140,9 +175,9 @@
# the revision is being processed out of order.
if current.dbdate is not None and current.dbdate > revision.date:
logging.debug(
- f"Invalidating frontier on {hash_to_hex(current.entry.id)}"
+ f"Invalidating frontier on {current.entry.id.hex()}"
f" (date {current.dbdate})"
- f" when processing revision {hash_to_hex(revision.id)}"
+ f" when processing revision {revision.id.hex()}"
f" (date {revision.date})"
)
current.invalidate()
@@ -162,11 +197,11 @@
fdates.update(provenance.content_get_early_dates(current.entry.files))
logging.debug(
- f"Isochrone graph for revision {hash_to_hex(revision.id)} successfully created!"
+ f"Isochrone graph for revision {revision.id.hex()} successfully created!"
)
# Precalculate max known date for each node in the graph (only directory nodes are
# pushed to the stack).
- logging.debug(f"Computing maxdates for revision {hash_to_hex(revision.id)}...")
+ logging.debug(f"Computing maxdates for revision {revision.id.hex()}...")
stack = [root]
while stack:
@@ -217,7 +252,5 @@
# node should be treated as unknown
current.maxdate = revision.date
current.known = False
- logging.debug(
- f"Maxdates for revision {hash_to_hex(revision.id)} successfully computed!"
- )
+ logging.debug(f"Maxdates for revision {revision.id.hex()} successfully computed!")
return root
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -6,9 +6,8 @@
import iso8601
-from swh.model.hashutil import hash_to_hex
-
from .archive import ArchiveInterface
+from .graph import HistoryNode, build_history_graph
from .model import OriginEntry, RevisionEntry
from .provenance import ProvenanceInterface
@@ -48,76 +47,60 @@
provenance: ProvenanceInterface,
archive: ArchiveInterface,
origins: List[OriginEntry],
-) -> None:
+):
start = time.time()
for origin in origins:
origin.retrieve_revisions(archive)
for revision in origin.revisions:
- origin_add_revision(provenance, archive, origin, revision)
+ graph = build_history_graph(archive, provenance, revision)
+ origin_add_revision(provenance, origin, graph)
done = time.time()
provenance.commit()
stop = time.time()
logging.debug(
"Origins "
- ";".join(
- [origin.url + ":" + hash_to_hex(origin.snapshot) for origin in origins]
- )
+ ";".join([origin.url + ":" + origin.snapshot.hex() for origin in origins])
+ f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
)
def origin_add_revision(
provenance: ProvenanceInterface,
- archive: ArchiveInterface,
origin: OriginEntry,
- revision: RevisionEntry,
-) -> None:
- stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)]
+ graph: HistoryNode,
+):
origin.id = provenance.origin_get_id(origin)
- while stack:
- relative, current = stack.pop()
-
- # Check if current revision has no preferred origin and update if necessary.
- preferred = provenance.revision_get_preferred_origin(current)
-
- if preferred is None:
- provenance.revision_set_preferred_origin(origin, current)
- ########################################################################
-
- if relative is None:
- # This revision is pointed directly by the origin.
- visited = provenance.revision_visited(current)
- provenance.revision_add_to_origin(origin, current)
+ # head is treated separately since it should always be added to the given origin
+ head = graph.entry
+ check_preferred_origin(provenance, origin, head)
+ provenance.revision_add_to_origin(origin, head)
- if not visited:
- stack.append((current, current))
+ # head's history should be recursively iterated starting from its parents
+ stack = list(graph.parents)
+ while stack:
+ current = stack.pop()
+ check_preferred_origin(provenance, origin, current.entry)
+ if current.visited:
+ # if current revision was already visited just add it to the current origin
+ # and stop recursion (its history has already been flattened)
+ provenance.revision_add_to_origin(origin, current.entry)
else:
- # This revision is a parent of another one in the history of the
- # relative revision.
- current.retrieve_parents(archive)
+ # if current revision was not visited before create a link between it and
+ # the head, and recursively walk its history
+ provenance.revision_add_before_revision(head, current.entry)
for parent in current.parents:
- visited = provenance.revision_visited(parent)
-
- if not visited:
- # The parent revision has never been seen before pointing
- # directly to an origin.
- known = provenance.revision_in_history(parent)
-
- if known:
- # The parent revision is already known in some other
- # revision's history. We should point it directly to
- # the origin and (eventually) walk its history.
- stack.append((None, parent))
- else:
- # The parent revision was never seen before. We should
- # walk its history and associate it with the same
- # relative revision.
- provenance.revision_add_before_revision(relative, parent)
- stack.append((relative, parent))
- else:
- # The parent revision already points to an origin, so its
- # history was properly processed before. We just need to
- # make sure it points to the current origin as well.
- provenance.revision_add_to_origin(origin, parent)
+ stack.append(parent)
+
+
+def check_preferred_origin(
+ provenance: ProvenanceInterface,
+ origin: OriginEntry,
+ revision: RevisionEntry,
+):
+ # if the revision has no preferred origin just set the given origin as the
+ # preferred one. TODO: this should be improved in the future!
+ preferred = provenance.revision_get_preferred_origin(revision)
+ if preferred is None:
+ provenance.revision_set_preferred_origin(origin, revision)
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -7,7 +7,7 @@
import iso8601
-from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.hashutil import hash_to_bytes
from .archive import ArchiveInterface
from .graph import IsochroneNode, build_isochrone_graph
@@ -71,7 +71,7 @@
date = provenance.revision_get_early_date(revision)
if date is None or revision.date < date:
logging.debug(
- f"Processing revisions {hash_to_hex(revision.id)}"
+ f"Processing revisions {revision.id.hex()}"
f" (known date {date} / revision date {revision.date})..."
)
graph = build_isochrone_graph(
@@ -95,11 +95,11 @@
provenance.commit()
stop = time.time()
logging.debug(
- f"Revisions {';'.join([hash_to_hex(revision.id) for revision in revisions])} "
+ f"Revisions {';'.join([revision.id.hex() for revision in revisions])} "
f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
)
# logging.critical(
- # ";".join([hash_to_hex(revision.id) for revision in revisions])
+ # ";".join([revision.id.hex() for revision in revisions])
# + f",{stop - start},{stop - done}"
# )
diff --git a/swh/provenance/tests/test_isochrone_graph.py b/swh/provenance/tests/test_isochrone_graph.py
--- a/swh/provenance/tests/test_isochrone_graph.py
+++ b/swh/provenance/tests/test_isochrone_graph.py
@@ -40,9 +40,9 @@
node.known = d.get("known", False)
node.invalid = d.get("invalid", False)
node.path = bytes(d["path"], encoding="utf-8")
- node.children = [
+ node.children = set(
isochrone_graph_from_dict(child, depth=depth + 1) for child in children
- ]
+ )
return node
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 2:48 PM (5 d, 9 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3235340
Attached To
D5886: Refactor origin-revision layer
Event Timeline
Log In to Comment