Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/revision.py
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import os
from typing import Generator, Iterable, Iterator, List, Optional, Tuple

from swh.core.statsd import statsd
from swh.model.model import Sha1Git

from .archive import ArchiveInterface
from .directory import directory_flatten
from .graph import IsochroneNode, build_isochrone_graph
from .interface import ProvenanceInterface
from .model import DirectoryEntry, RevisionEntry

REVISION_DURATION_METRIC = "swh_provenance_revision_content_layer_duration_seconds"
class CSVRevisionIterator: | class CSVRevisionIterator: | ||||
▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines | while stack: | ||||
# frontier. | # frontier. | ||||
provenance.directory_set_date_in_isochrone_frontier( | provenance.directory_set_date_in_isochrone_frontier( | ||||
current.entry, current.maxdate | current.entry, current.maxdate | ||||
) | ) | ||||
if trackall: | if trackall: | ||||
provenance.directory_add_to_revision( | provenance.directory_add_to_revision( | ||||
revision, current.entry, current.path | revision, current.entry, current.path | ||||
) | ) | ||||
flatten_directory( | directory_flatten( | ||||
provenance, archive, current.entry, minsize=minsize | provenance, archive, current.entry, minsize=minsize | ||||
) | ) | ||||
else: | else: | ||||
# If current node is an invalidated frontier, update its date for future | # If current node is an invalidated frontier, update its date for future | ||||
# revisions to get the proper value. | # revisions to get the proper value. | ||||
if current.invalid: | if current.invalid: | ||||
provenance.directory_set_date_in_isochrone_frontier( | provenance.directory_set_date_in_isochrone_frontier( | ||||
current.entry, current.maxdate | current.entry, current.maxdate | ||||
) | ) | ||||
# No point moving the frontier here. Either there are no files or they | # No point moving the frontier here. Either there are no files or they | ||||
# are being seen for the first time here. Add all blobs to current | # are being seen for the first time here. Add all blobs to current | ||||
# revision updating date if necessary, and recursively analyse | # revision updating date if necessary, and recursively analyse | ||||
# subdirectories as candidates to the outer frontier. | # subdirectories as candidates to the outer frontier. | ||||
for blob in current.entry.files: | for blob in current.entry.files: | ||||
date = provenance.content_get_early_date(blob) | date = provenance.content_get_early_date(blob) | ||||
if date is None or revision.date < date: | if date is None or revision.date < date: | ||||
provenance.content_set_early_date(blob, revision.date) | provenance.content_set_early_date(blob, revision.date) | ||||
provenance.content_add_to_revision(revision, blob, current.path) | provenance.content_add_to_revision(revision, blob, current.path) | ||||
for child in current.children: | for child in current.children: | ||||
stack.append(child) | stack.append(child) | ||||
@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "flatten_directory"})
def flatten_directory(
    provenance: ProvenanceInterface,
    archive: ArchiveInterface,
    directory: DirectoryEntry,
    minsize: int = 0,
) -> None:
    """Recursively retrieve all the files of 'directory' and insert them in the
    'provenance' database in the 'content_to_directory' table.
    """
    # Iterative depth-first traversal; each pending entry pairs a directory
    # node with the byte-string path prefix leading to it from the root.
    pending = [(directory, b"")]
    while pending:
        node, path = pending.pop()
        # Populate node.files / node.dirs from the archive, forwarding the
        # 'minsize' filter to the archive layer.
        node.retrieve_children(archive, minsize=minsize)
        for blob in node.files:
            # Every file is linked to the *root* 'directory' entry with its
            # full relative path, regardless of nesting depth.
            provenance.content_add_to_directory(directory, blob, path)
        # Defer subdirectories, extending the path prefix with their name.
        pending.extend(
            (subdir, os.path.join(path, subdir.name)) for subdir in node.dirs
        )
def is_new_frontier( | def is_new_frontier( | ||||
node: IsochroneNode, | node: IsochroneNode, | ||||
revision: RevisionEntry, | revision: RevisionEntry, | ||||
trackall: bool = True, | trackall: bool = True, | ||||
lower: bool = True, | lower: bool = True, | ||||
mindepth: int = 1, | mindepth: int = 1, | ||||
) -> bool: | ) -> bool: | ||||
assert node.maxdate is not None # for mypy | assert node.maxdate is not None # for mypy | ||||
▲ Show 20 Lines • Show All 56 Lines • Show Last 20 Lines |