D5847: Improve out-of-order revision processing
D5847.diff (16 KB)
diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py
--- a/swh/provenance/postgresql/provenancedb_base.py
+++ b/swh/provenance/postgresql/provenancedb_base.py
@@ -1,7 +1,7 @@
from datetime import datetime
import itertools
import logging
-from typing import Any, Dict, Iterable, Optional, Set
+from typing import Any, Dict, Iterable, Optional
import psycopg2
import psycopg2.extras
@@ -23,7 +23,6 @@
# XXX: not sure this is the best place to do it!
self.cursor.execute("SET timezone TO 'UTC'")
self.insert_cache: Dict[str, Any] = {}
- self.remove_cache: Dict[str, Set[bytes]] = {}
self.select_cache: Dict[str, Any] = {}
self.clear_caches()
@@ -38,7 +37,6 @@
"revision_before_rev": list(),
"revision_in_org": list(),
}
- self.remove_cache = {"directory": set()}
self.select_cache = {"content": dict(), "directory": dict(), "revision": dict()}
def commit(self):
@@ -108,7 +106,7 @@
) -> Optional[datetime]:
# First check if the date is being modified by the current transaction.
date = self.insert_cache["directory"].get(directory.id, None)
- if date is None and directory.id not in self.remove_cache["directory"]:
+ if date is None:
# If not, check whether it's been queried before
date = self.select_cache["directory"].get(directory.id, None)
if date is None:
@@ -131,7 +129,7 @@
date = self.insert_cache["directory"].get(directory.id, None)
if date is not None:
dates[directory.id] = date
- elif directory.id not in self.remove_cache["directory"]:
+ else:
# If not, check whether it's been queried before
date = self.select_cache["directory"].get(directory.id, None)
if date is not None:
@@ -150,15 +148,10 @@
self.select_cache["directory"][sha1] = date
return dates
- def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry):
- self.remove_cache["directory"].add(directory.id)
- self.insert_cache["directory"].pop(directory.id, None)
-
def directory_set_date_in_isochrone_frontier(
self, directory: DirectoryEntry, date: datetime
):
self.insert_cache["directory"][directory.id] = date
- self.remove_cache["directory"].discard(directory.id)
def insert_all(self):
# Perform insertions with cached information
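This hunk removes the remove_cache layer entirely: a directory date lookup now consults only the pending-insert cache and the select cache before falling back to the database. Below is a minimal sketch of the resulting lookup order; TwoCacheSketch is an illustrative stand-in, not the real backend class, and the database fallback is stubbed out:

from datetime import datetime
from typing import Any, Dict, Optional

class TwoCacheSketch:
    """Illustrative stand-in for the provenance backend after this change."""

    def __init__(self) -> None:
        # Only two caches remain: values pending insertion in the current
        # transaction, and values previously read from the database.
        self.insert_cache: Dict[str, Any] = {"directory": {}}
        self.select_cache: Dict[str, Any] = {"directory": {}}

    def directory_get_date_in_isochrone_frontier(
        self, directory_id: bytes
    ) -> Optional[datetime]:
        # First check if the date is being modified by the current transaction.
        date = self.insert_cache["directory"].get(directory_id)
        if date is None:
            # If not, check whether it's been queried before; a real backend
            # would run a SELECT on a miss and populate select_cache here.
            date = self.select_cache["directory"].get(directory_id)
        return date

Invalidation no longer needs a third cache because the isochrone-graph node itself records it, via the invalid flag introduced in provenance.py below.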
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -69,11 +69,6 @@
) -> Dict[bytes, datetime]:
...
- def directory_invalidate_in_isochrone_frontier(
- self, directory: DirectoryEntry
- ) -> None:
- ...
-
def directory_set_date_in_isochrone_frontier(
self, directory: DirectoryEntry, date: datetime
) -> None:
@@ -217,6 +212,7 @@
trackall: bool = True,
lower: bool = True,
mindepth: int = 1,
+ commit: bool = True,
) -> None:
start = time.time()
for revision in revisions:
@@ -246,14 +242,15 @@
mindepth=mindepth,
)
done = time.time()
- # TODO: improve this! Maybe using a max attempt counter?
- # Ideally Provenance class should guarantee that a commit never fails.
- while not provenance.commit():
- logging.warning(
- "Could not commit revisions "
- + ";".join([hash_to_hex(revision.id) for revision in revisions])
- + ". Retrying..."
- )
+ if commit:
+ # TODO: improve this! Maybe using a max attempt counter?
+ # Ideally Provenance class should guarantee that a commit never fails.
+ while not provenance.commit():
+ logging.warning(
+ "Could not commit revisions "
+ + ";".join([hash_to_hex(revision.id) for revision in revisions])
+ + ". Retrying..."
+ )
stop = time.time()
logging.debug(
f"Revisions {';'.join([hash_to_hex(revision.id) for revision in revisions])} "
@@ -285,7 +282,8 @@
# known is True if this node is already known in the db; either because
# the current directory actually exists in the database, or because all
# the content of the current directory is known (subdirectories and files)
- self.known: bool = self.dbdate is not None
+ self.known = self.dbdate is not None
+ self.invalid = False
self.path = os.path.join(prefix, self.entry.name) if prefix else self.entry.name
self.children: List[IsochroneNode] = []
@@ -298,6 +296,7 @@
self._dbdate = None
self.maxdate = None
self.known = False
+ self.invalid = True
def add_directory(
self, child: DirectoryEntry, date: Optional[datetime] = None
@@ -311,9 +310,8 @@
def __str__(self):
return (
- f"<{self.entry}: "
- f"known={self.known}, maxdate={self.maxdate}, "
- f"dbdate={self.dbdate}, path={self.path}, "
+ f"<{self.entry}: dbdate={self.dbdate}, maxdate={self.maxdate}, "
+ f"known={self.known}, invalid={self.invalid}, path={self.path}, "
f"children=[{', '.join(str(child) for child in self.children)}]>"
)
@@ -326,6 +324,7 @@
self._dbdate,
self.maxdate,
self.known,
+ self.invalid,
self.path,
)
== (
@@ -334,6 +333,7 @@
other._dbdate,
other.maxdate,
other.known,
+ other.invalid,
other.path,
)
and Counter(self.children) == Counter(other.children)
@@ -341,7 +341,15 @@
def __hash__(self):
return hash(
- (self.entry, self.depth, self._dbdate, self.maxdate, self.known, self.path)
+ (
+ self.entry,
+ self.depth,
+ self._dbdate,
+ self.maxdate,
+ self.known,
+ self.invalid,
+ self.path,
+ )
)
@@ -386,7 +394,6 @@
f" when processing revision {hash_to_hex(revision.id)}"
f" (date {revision.date})"
)
- provenance.directory_invalidate_in_isochrone_frontier(current.entry)
current.invalidate()
# Pre-query all known dates for directories in the current directory
@@ -431,7 +438,7 @@
# all the files and directories under current have a maxdate,
# we can infer the maxdate for current directory
assert current.maxdate is None
- # If all content is already known, update current directory info.
+ # if all content is already known, update current directory info.
current.maxdate = max(
[UTCMIN]
+ [
@@ -444,14 +451,21 @@
for file in current.entry.files
]
)
- current.known = (
- # true if all subdirectories are known
- all(child.known for child in current.children)
- # true if all files are in fdates, i.e. if all files were known
- # *before building this isochrone graph node*
- # Note: the 'all()' is lazy: will stop iterating as soon as possible
- and all((file.id in fdates) for file in current.entry.files)
- )
+ if current.maxdate <= revision.date:
+ current.known = (
+ # true if all subdirectories are known
+ all(child.known for child in current.children)
+ # true if all files are in fdates, i.e. if all files were known
+ # *before building this isochrone graph node*
+ # Note: the 'all()' is lazy: will stop iterating as soon as
+ # possible
+ and all((file.id in fdates) for file in current.entry.files)
+ )
+ else:
+ # at least one content is being processed out-of-order, so the current
+ # node should be treated as unknown
+ current.maxdate = revision.date
+ current.known = False
logging.debug(
f"Maxdates for revision {hash_to_hex(revision.id)} successfully computed!"
)
@@ -482,6 +496,7 @@
revision, current.entry, current.path
)
else:
+ assert current.maxdate is not None
# Current directory is not an outer isochrone frontier for any previous
# revision. It might be eligible for this one.
if is_new_frontier(
@@ -491,7 +506,6 @@
lower=lower,
mindepth=mindepth,
):
- assert current.maxdate is not None
# Outer frontier should be moved to current position in the isochrone
# graph. This is the first time this directory is found in the isochrone
# frontier.
@@ -504,6 +518,12 @@
)
flatten_directory(archive, provenance, current.entry)
else:
+ # If the current node is an invalidated frontier, update its date so
+ # that future revisions get the proper value.
+ if current.invalid:
+ provenance.directory_set_date_in_isochrone_frontier(
+ current.entry, current.maxdate
+ )
# No point moving the frontier here. Either there are no files or they
# are being seen for the first time here. Add all blobs to current
# revision updating date if necessary, and recursively analyse
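The behavioural core of the change sits in the hunks above: IsochroneNode gains an invalid flag, set by invalidate() when a stored frontier date turns out to be inconsistent, and the maxdate computation now compares against the revision date to detect out-of-order content. A minimal sketch of that decision, assuming a stripped-down Node in place of IsochroneNode and ignoring the fdates bookkeeping the real code also performs:

from datetime import datetime, timezone
from typing import List, Optional

UTCMIN = datetime.min.replace(tzinfo=timezone.utc)  # assumed definition

class Node:
    def __init__(self) -> None:
        self.maxdate: Optional[datetime] = None
        self.known: bool = False
        self.invalid: bool = False
        self.children: List["Node"] = []

    def invalidate(self) -> None:
        # Replaces the old remove_cache mechanism: the node itself remembers
        # that its stored frontier date was found to be inconsistent.
        self.maxdate = None
        self.known = False
        self.invalid = True

def compute_maxdate(current: Node, file_dates: List[datetime],
                    revision_date: datetime) -> None:
    # Every child already has a maxdate at this point.
    current.maxdate = max(
        [UTCMIN]
        + [child.maxdate for child in current.children if child.maxdate is not None]
        + file_dates
    )
    if current.maxdate <= revision_date:
        # Dates are consistent: known if all subdirectories are known (the
        # real code additionally requires all files to be in fdates).
        current.known = all(child.known for child in current.children)
    else:
        # At least one content was processed out-of-order: clamp the date to
        # the revision's and treat the node as unknown, so invalidated
        # frontiers get a fresh date on a later revision.
        current.maxdate = revision_date
        current.known = False

The new commit flag on revision_add complements this: callers can now batch several revisions and defer the commit, which the test changes below exercise.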
diff --git a/swh/provenance/tests/data/graphs_out-of-order_lower_1.yaml b/swh/provenance/tests/data/graphs_out-of-order_lower_1.yaml
--- a/swh/provenance/tests/data/graphs_out-of-order_lower_1.yaml
+++ b/swh/provenance/tests/data/graphs_out-of-order_lower_1.yaml
@@ -134,29 +134,26 @@
entry:
id: "b3cf11b22c9f93c3c494cf90ab072f394155072d"
name: ""
- maxdate: 1000000010.0
- known: True # TODO: analyse this, as it might be a source of issues!
+ maxdate: 1000000005.0
path: ""
children:
- entry:
id: "baca735bf8b8720131b4bfdb47c51631a9260348"
name: "A"
- maxdate: 1000000010.0
- known: True
+ maxdate: 1000000005.0
path: "A"
children:
- entry:
id: "4b28979d88ed209a09c272bcc80f69d9b18339c2"
name: "B"
- maxdate: 1000000010.0
- known: True
+ maxdate: 1000000005.0
path: "A/B"
children:
- entry:
id: "c9cabe7f49012e3fdef6ac6b929efb5654f583cf"
name: "C"
- maxdate: 1000000010.0
- known: True
+ maxdate: 1000000005.0
+ invalid: True
path: "A/B/C"
# Isochrone graph for R06
- rev: "53519b5a5e8cf12a4f81f82e489f95c1d04d5314"
@@ -182,7 +179,7 @@
- entry:
id: "c9cabe7f49012e3fdef6ac6b929efb5654f583cf"
name: "C"
- dbdate: 1000000010.0
- maxdate: 1000000010.0
+ dbdate: 1000000005.0
+ maxdate: 1000000005.0
known: True
path: "A/B/C"
diff --git a/swh/provenance/tests/data/synthetic_out-of-order_lower_1.txt b/swh/provenance/tests/data/synthetic_out-of-order_lower_1.txt
--- a/swh/provenance/tests/data/synthetic_out-of-order_lower_1.txt
+++ b/swh/provenance/tests/data/synthetic_out-of-order_lower_1.txt
@@ -33,10 +33,6 @@
1000000050 53519b5a5e8cf12a4f81f82e489f95c1d04d5314 R06
R06 | | | R 53519b5a5e8cf12a4f81f82e489f95c1d04d5314 | 1000000050
| R---C | A/B/c | C fa08654474ae2ddc4f61ee3a43d017ba65a439c3 | 0
-# Note the ts below (-40) is NOT the same as the maxdate of its content (-45)!
-# This is because the ts of the existing frontier (the R-D below) has not been updated by the
-# "new" version of the b file (aka older ts) of R05.
-# /!\ This is true only when ingesting revisions one at a time!
- | R-D | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -40
+ | R-D | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -45
| D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -50
| D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -45
diff --git a/swh/provenance/tests/test_isochrone_graph.py b/swh/provenance/tests/test_isochrone_graph.py
--- a/swh/provenance/tests/test_isochrone_graph.py
+++ b/swh/provenance/tests/test_isochrone_graph.py
@@ -37,6 +37,7 @@
)
node.maxdate = datetime.fromtimestamp(d["maxdate"], timezone.utc)
node.known = d.get("known", False)
+ node.invalid = d.get("invalid", False)
node.path = bytes(d["path"], encoding="utf-8")
node.children = [
isochrone_graph_from_dict(child, depth=depth + 1) for child in children
@@ -54,7 +55,10 @@
("out-of-order", True, 1),
),
)
-def test_isochrone_graph(provenance, swh_storage, archive, repo, lower, mindepth):
+@pytest.mark.parametrize("batch", (True, False))
+def test_isochrone_graph(
+ provenance, swh_storage, archive, repo, lower, mindepth, batch
+):
# read data/README.md for more details on how these datasets are generated
data = load_repo_data(repo)
fill_storage(swh_storage, data)
@@ -64,6 +68,7 @@
with open(get_datafile(filename)) as file:
for expected in yaml.full_load(file):
+ print("# Processing revision", expected["rev"])
revision = revisions[hash_to_bytes(expected["rev"])]
entry = RevisionEntry(
id=revision["id"],
@@ -71,7 +76,7 @@
root=revision["directory"],
)
expected_graph = isochrone_graph_from_dict(expected["graph"])
- print("Expected", expected_graph)
+ print("Expected graph:", expected_graph)
# Create graph for current revision and check it has the expected structure.
computed_graph = build_isochrone_graph(
@@ -80,9 +85,16 @@
entry,
DirectoryEntry(entry.root),
)
- print("Computed", computed_graph)
+ print("Computed graph:", computed_graph)
assert computed_graph == expected_graph
# Add current revision so that provenance info is kept up to date for the
# following ones.
- revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth)
+ revision_add(
+ provenance,
+ archive,
+ [entry],
+ lower=lower,
+ mindepth=mindepth,
+ commit=not batch,
+ )
diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
--- a/swh/provenance/tests/test_provenance_heuristics.py
+++ b/swh/provenance/tests/test_provenance_heuristics.py
@@ -198,8 +198,9 @@
("out-of-order", True, 1),
),
)
+@pytest.mark.parametrize("batch", (True, False))
def test_provenance_heuristics_content_find_all(
- provenance, swh_storage, archive, repo, lower, mindepth
+ provenance, swh_storage, archive, repo, lower, mindepth, batch
):
# read data/README.md for more details on how these datasets are generated
data = load_repo_data(repo)
@@ -213,11 +214,13 @@
for revision in data["revision"]
]
- # XXX adding all revisions at once should be working just fine, but it does not...
- # revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
- # ...so add revisions one at a time for now
- for revision in revisions:
- revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth)
+ if batch:
+ revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
+ else:
+ for revision in revisions:
+ revision_add(
+ provenance, archive, [revision], lower=lower, mindepth=mindepth
+ )
syntheticfile = get_datafile(
f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
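The batch parametrization replaces the old workaround of ingesting revisions strictly one at a time. A hedged sketch of the two ingestion modes the test now exercises; revision_add and its keyword arguments follow the diff, while provenance, archive and revisions stand for the test's fixtures and data:

from swh.provenance.provenance import revision_add  # module path per this diff

def ingest(provenance, archive, revisions, lower, mindepth, batch):
    if batch:
        # One call for the whole list: a single pass over all revisions,
        # which this differential makes safe for out-of-order data.
        revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
    else:
        # The pre-existing workaround: process each revision separately.
        for revision in revisions:
            revision_add(
                provenance, archive, [revision], lower=lower, mindepth=mindepth
            )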