Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/revision.py
Show All 9 Lines | |||||
from swh.core.statsd import statsd | from swh.core.statsd import statsd | ||||
from swh.model.model import Sha1Git | from swh.model.model import Sha1Git | ||||
from .archive import ArchiveInterface | from .archive import ArchiveInterface | ||||
from .graph import IsochroneNode, build_isochrone_graph | from .graph import IsochroneNode, build_isochrone_graph | ||||
from .interface import ProvenanceInterface | from .interface import ProvenanceInterface | ||||
from .model import DirectoryEntry, RevisionEntry | from .model import DirectoryEntry, RevisionEntry | ||||
REVISION_DURATION_METRIC = "swh_provenance_revision_content_layer_duration_seconds" | |||||
class CSVRevisionIterator: | class CSVRevisionIterator: | ||||
"""Iterator over revisions typically present in the given CSV file. | """Iterator over revisions typically present in the given CSV file. | ||||
The input is an iterator that produces 3 elements per row: | The input is an iterator that produces 3 elements per row: | ||||
(id, date, root) | (id, date, root) | ||||
Show All 18 Lines | class CSVRevisionIterator: | ||||
def __iter__(self) -> Generator[RevisionEntry, None, None]: | def __iter__(self) -> Generator[RevisionEntry, None, None]: | ||||
for id, date, root in self.revisions: | for id, date, root in self.revisions: | ||||
if date.tzinfo is None: | if date.tzinfo is None: | ||||
date = date.replace(tzinfo=timezone.utc) | date = date.replace(tzinfo=timezone.utc) | ||||
yield RevisionEntry(id, date=date, root=root) | yield RevisionEntry(id, date=date, root=root) | ||||
@statsd.timed( | @statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"}) | ||||
metric="swh_provenance_revision_content_layer_accesstime_seconds", | |||||
tags={"method": "main"}, | |||||
) | |||||
def revision_add( | def revision_add( | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, | ||||
archive: ArchiveInterface, | archive: ArchiveInterface, | ||||
revisions: List[RevisionEntry], | revisions: List[RevisionEntry], | ||||
trackall: bool = True, | trackall: bool = True, | ||||
lower: bool = True, | lower: bool = True, | ||||
mindepth: int = 1, | mindepth: int = 1, | ||||
commit: bool = True, | commit: bool = True, | ||||
Show All 19 Lines | for revision in revisions: | ||||
trackall=trackall, | trackall=trackall, | ||||
lower=lower, | lower=lower, | ||||
mindepth=mindepth, | mindepth=mindepth, | ||||
) | ) | ||||
if commit: | if commit: | ||||
provenance.flush() | provenance.flush() | ||||
@statsd.timed( | @statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "process_content"}) | ||||
metric="swh_provenance_revision_content_layer_accesstime_seconds", | |||||
tags={"method": "process_content"}, | |||||
) | |||||
def revision_process_content( | def revision_process_content( | ||||
archive: ArchiveInterface, | archive: ArchiveInterface, | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, | ||||
revision: RevisionEntry, | revision: RevisionEntry, | ||||
graph: IsochroneNode, | graph: IsochroneNode, | ||||
trackall: bool = True, | trackall: bool = True, | ||||
lower: bool = True, | lower: bool = True, | ||||
mindepth: int = 1, | mindepth: int = 1, | ||||
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines | while stack: | ||||
date = provenance.content_get_early_date(blob) | date = provenance.content_get_early_date(blob) | ||||
if date is None or revision.date < date: | if date is None or revision.date < date: | ||||
provenance.content_set_early_date(blob, revision.date) | provenance.content_set_early_date(blob, revision.date) | ||||
provenance.content_add_to_revision(revision, blob, current.path) | provenance.content_add_to_revision(revision, blob, current.path) | ||||
for child in current.children: | for child in current.children: | ||||
stack.append(child) | stack.append(child) | ||||
@statsd.timed( | @statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "flatten_directory"}) | ||||
metric="swh_provenance_revision_content_layer_accesstime_seconds", | |||||
tags={"method": "flatten_directory"}, | |||||
) | |||||
def flatten_directory( | def flatten_directory( | ||||
archive: ArchiveInterface, | archive: ArchiveInterface, | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, | ||||
directory: DirectoryEntry, | directory: DirectoryEntry, | ||||
) -> None: | ) -> None: | ||||
"""Recursively retrieve all the files of 'directory' and insert them in the | """Recursively retrieve all the files of 'directory' and insert them in the | ||||
'provenance' database in the 'content_to_directory' table. | 'provenance' database in the 'content_to_directory' table. | ||||
""" | """ | ||||
Show All 14 Lines | def is_new_frontier( | ||||
revision: RevisionEntry, | revision: RevisionEntry, | ||||
trackall: bool = True, | trackall: bool = True, | ||||
lower: bool = True, | lower: bool = True, | ||||
mindepth: int = 1, | mindepth: int = 1, | ||||
) -> bool: | ) -> bool: | ||||
assert node.maxdate is not None # for mypy | assert node.maxdate is not None # for mypy | ||||
assert revision.date is not None # idem | assert revision.date is not None # idem | ||||
if trackall: | if trackall: | ||||
# The only real condition for a directory to be a frontier is that its | # The only real condition for a directory to be a frontier is that its content | ||||
# content is already known and its maxdate is less (or equal) than | # is already known and its maxdate is less (or equal) than current revision's | ||||
# current revision's date. Checking mindepth is meant to skip root | # date. Checking mindepth is meant to skip root directories (or any arbitrary | ||||
# directories (or any arbitrary depth) to improve the result. The | # depth) to improve the result. The option lower tries to maximize the reuse | ||||
# option lower tries to maximize the reusage rate of previously defined | # rate of previously defined frontiers by keeping them low in the directory | ||||
# frontiers by keeping them low in the directory tree. | # tree. | ||||
return ( | return ( | ||||
node.known | node.known | ||||
and node.maxdate <= revision.date # all content is earlier than revision | and node.maxdate <= revision.date # all content is earlier than revision | ||||
and node.depth | and node.depth | ||||
>= mindepth # current node is deeper than the min allowed depth | >= mindepth # current node is deeper than the min allowed depth | ||||
and (has_blobs(node) if lower else True) # there is at least one blob in it | and (has_blobs(node) if lower else True) # there is at least one blob in it | ||||
) | ) | ||||
else: | else: | ||||
# If we are only tracking first occurrences, we want to ensure that all first | # If we are only tracking first occurrences, we want to ensure that all first | ||||
# occurrences end up in the content_early_in_rev relation. Thus, we force for | # occurrences end up in the content_early_in_rev relation. Thus, we force for | ||||
# every blob outside a frontier to have an extrictly earlier date. | # every blob outside a frontier to have an strictly earlier date. | ||||
return ( | return ( | ||||
node.maxdate < revision.date # all content is earlier than revision | node.maxdate < revision.date # all content is earlier than revision | ||||
and node.depth >= mindepth # deeper than the min allowed depth | and node.depth >= mindepth # deeper than the min allowed depth | ||||
and (has_blobs(node) if lower else True) # there is at least one blob | and (has_blobs(node) if lower else True) # there is at least one blob | ||||
) | ) | ||||
def has_blobs(node: IsochroneNode) -> bool: | def has_blobs(node: IsochroneNode) -> bool: | ||||
Show All 29 Lines |