Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/provenance.py
import os | |||||
from datetime import datetime | from datetime import datetime | ||||
import os | |||||
from typing import Dict, Generator, List, Optional, Tuple | from typing import Dict, Generator, List, Optional, Tuple | ||||
from typing_extensions import Protocol, runtime_checkable | |||||
from .archive import ArchiveInterface | from .archive import ArchiveInterface | ||||
from .model import DirectoryEntry, FileEntry, TreeEntry | from .model import DirectoryEntry, FileEntry, TreeEntry | ||||
from .origin import OriginEntry | from .origin import OriginEntry | ||||
from .revision import RevisionEntry | from .revision import RevisionEntry | ||||
# TODO: consider moving to path utils file together with normalize. | # TODO: consider moving to path utils file together with normalize. | ||||
def is_child(path: bytes, prefix: bytes) -> bool: | def is_child(path: bytes, prefix: bytes) -> bool: | ||||
return path != prefix and os.path.dirname(path) == prefix | return path != prefix and os.path.dirname(path) == prefix | ||||
class ProvenanceInterface: | @runtime_checkable | ||||
def __init__(self, **kwargs): | class ProvenanceInterface(Protocol): | ||||
raise NotImplementedError | |||||
def commit(self): | def commit(self): | ||||
raise NotImplementedError | """Commit currently ongoing transactions in the backend DB""" | ||||
... | |||||
def content_add_to_directory( | def content_add_to_directory( | ||||
self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes | self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes | ||||
): | ) -> None: | ||||
raise NotImplementedError | ... | ||||
def content_add_to_revision( | def content_add_to_revision( | ||||
self, revision: RevisionEntry, blob: FileEntry, prefix: bytes | self, revision: RevisionEntry, blob: FileEntry, prefix: bytes | ||||
): | ) -> None: | ||||
raise NotImplementedError | ... | ||||
def content_find_first( | def content_find_first( | ||||
self, blobid: bytes | self, blobid: bytes | ||||
) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: | ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: | ||||
raise NotImplementedError | ... | ||||
def content_find_all( | def content_find_all( | ||||
self, blobid: bytes | self, blobid: bytes | ||||
) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: | ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: | ||||
raise NotImplementedError | ... | ||||
def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: | def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: | ||||
raise NotImplementedError | ... | ||||
def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: | def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: | ||||
raise NotImplementedError | ... | ||||
def content_set_early_date(self, blob: FileEntry, date: datetime): | def content_set_early_date(self, blob: FileEntry, date: datetime) -> None: | ||||
raise NotImplementedError | ... | ||||
def directory_add_to_revision( | def directory_add_to_revision( | ||||
self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes | self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes | ||||
): | ) -> None: | ||||
raise NotImplementedError | ... | ||||
def directory_get_date_in_isochrone_frontier( | def directory_get_date_in_isochrone_frontier( | ||||
self, directory: DirectoryEntry | self, directory: DirectoryEntry | ||||
) -> Optional[datetime]: | ) -> Optional[datetime]: | ||||
raise NotImplementedError | ... | ||||
def directory_get_dates_in_isochrone_frontier( | def directory_get_dates_in_isochrone_frontier( | ||||
self, dirs: List[DirectoryEntry] | self, dirs: List[DirectoryEntry] | ||||
) -> Dict[bytes, datetime]: | ) -> Dict[bytes, datetime]: | ||||
raise NotImplementedError | ... | ||||
def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry): | def directory_invalidate_in_isochrone_frontier( | ||||
raise NotImplementedError | self, directory: DirectoryEntry | ||||
) -> None: | |||||
... | |||||
def directory_set_date_in_isochrone_frontier( | def directory_set_date_in_isochrone_frontier( | ||||
self, directory: DirectoryEntry, date: datetime | self, directory: DirectoryEntry, date: datetime | ||||
): | ) -> None: | ||||
raise NotImplementedError | ... | ||||
def origin_get_id(self, origin: OriginEntry) -> int: | def origin_get_id(self, origin: OriginEntry) -> int: | ||||
raise NotImplementedError | ... | ||||
def revision_add(self, revision: RevisionEntry): | def revision_add(self, revision: RevisionEntry) -> None: | ||||
raise NotImplementedError | ... | ||||
def revision_add_before_revision( | def revision_add_before_revision( | ||||
self, relative: RevisionEntry, revision: RevisionEntry | self, relative: RevisionEntry, revision: RevisionEntry | ||||
): | ) -> None: | ||||
raise NotImplementedError | ... | ||||
def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): | def revision_add_to_origin( | ||||
raise NotImplementedError | self, origin: OriginEntry, revision: RevisionEntry | ||||
) -> None: | |||||
... | |||||
def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: | def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: | ||||
raise NotImplementedError | ... | ||||
def revision_get_preferred_origin(self, revision: RevisionEntry) -> int: | def revision_get_preferred_origin(self, revision: RevisionEntry) -> int: | ||||
raise NotImplementedError | ... | ||||
def revision_in_history(self, revision: RevisionEntry) -> bool: | def revision_in_history(self, revision: RevisionEntry) -> bool: | ||||
raise NotImplementedError | ... | ||||
def revision_set_preferred_origin( | def revision_set_preferred_origin( | ||||
self, origin: OriginEntry, revision: RevisionEntry | self, origin: OriginEntry, revision: RevisionEntry | ||||
): | ) -> None: | ||||
raise NotImplementedError | ... | ||||
def revision_visited(self, revision: RevisionEntry) -> bool: | def revision_visited(self, revision: RevisionEntry) -> bool: | ||||
raise NotImplementedError | ... | ||||
def directory_process_content( | def directory_process_content( | ||||
provenance: ProvenanceInterface, directory: DirectoryEntry, relative: DirectoryEntry | provenance: ProvenanceInterface, directory: DirectoryEntry, relative: DirectoryEntry | ||||
): | ) -> None: | ||||
stack = [(directory, b"")] | stack = [(directory, b"")] | ||||
while stack: | while stack: | ||||
current, prefix = stack.pop() | current, prefix = stack.pop() | ||||
for child in iter(current): | for child in iter(current): | ||||
if isinstance(child, FileEntry): | if isinstance(child, FileEntry): | ||||
# Add content to the relative directory with the computed prefix. | # Add content to the relative directory with the computed prefix. | ||||
provenance.content_add_to_directory(relative, child, prefix) | provenance.content_add_to_directory(relative, child, prefix) | ||||
else: | else: | ||||
# Recursively walk the child directory. | # Recursively walk the child directory. | ||||
stack.append((child, os.path.join(prefix, child.name))) | stack.append((child, os.path.join(prefix, child.name))) | ||||
def origin_add(provenance: ProvenanceInterface, origin: OriginEntry): | def origin_add(provenance: ProvenanceInterface, origin: OriginEntry) -> None: | ||||
# TODO: refactor to iterate over origin visit statuses and commit only once | # TODO: refactor to iterate over origin visit statuses and commit only once | ||||
# per status. | # per status. | ||||
origin.id = provenance.origin_get_id(origin) | origin.id = provenance.origin_get_id(origin) | ||||
for revision in origin.revisions: | for revision in origin.revisions: | ||||
origin_add_revision(provenance, origin, revision) | origin_add_revision(provenance, origin, revision) | ||||
# Commit after each revision | # Commit after each revision | ||||
provenance.commit() # TODO: verify this! | provenance.commit() # TODO: verify this! | ||||
def origin_add_revision( | def origin_add_revision( | ||||
provenance: ProvenanceInterface, origin: OriginEntry, revision: RevisionEntry | provenance: ProvenanceInterface, origin: OriginEntry, revision: RevisionEntry | ||||
): | ) -> None: | ||||
stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)] | stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)] | ||||
while stack: | while stack: | ||||
relative, current = stack.pop() | relative, current = stack.pop() | ||||
# Check if current revision has no preferred origin and update if necessary. | # Check if current revision has no preferred origin and update if necessary. | ||||
preferred = provenance.revision_get_preferred_origin(current) | preferred = provenance.revision_get_preferred_origin(current) | ||||
Show All 35 Lines | while stack: | ||||
# The parent revision already points to an origin, so its | # The parent revision already points to an origin, so its | ||||
# history was properly processed before. We just need to | # history was properly processed before. We just need to | ||||
# make sure it points to the current origin as well. | # make sure it points to the current origin as well. | ||||
provenance.revision_add_to_origin(origin, parent) | provenance.revision_add_to_origin(origin, parent) | ||||
def revision_add( | def revision_add( | ||||
provenance: ProvenanceInterface, archive: ArchiveInterface, revision: RevisionEntry | provenance: ProvenanceInterface, archive: ArchiveInterface, revision: RevisionEntry | ||||
): | ) -> None: | ||||
assert revision.date is not None | assert revision.date is not None | ||||
assert revision.root is not None | assert revision.root is not None | ||||
# Processed content starting from the revision's root directory. | # Processed content starting from the revision's root directory. | ||||
date = provenance.revision_get_early_date(revision) | date = provenance.revision_get_early_date(revision) | ||||
if date is None or revision.date < date: | if date is None or revision.date < date: | ||||
provenance.revision_add(revision) | provenance.revision_add(revision) | ||||
# TODO: add file size filtering | # TODO: add file size filtering | ||||
revision_process_content( | revision_process_content( | ||||
Show All 18 Lines | ) -> "IsochroneNode": | ||||
assert isinstance(self.entry, DirectoryEntry) and self.date is None | assert isinstance(self.entry, DirectoryEntry) and self.date is None | ||||
node = IsochroneNode(child, dates=dates) | node = IsochroneNode(child, dates=dates) | ||||
self.children.append(node) | self.children.append(node) | ||||
return node | return node | ||||
def build_isochrone_graph( | def build_isochrone_graph( | ||||
provenance: ProvenanceInterface, revision: RevisionEntry, directory: DirectoryEntry | provenance: ProvenanceInterface, revision: RevisionEntry, directory: DirectoryEntry | ||||
): | ) -> IsochroneNode: | ||||
assert revision.date is not None | assert revision.date is not None | ||||
# Build the nodes structure | # Build the nodes structure | ||||
root = IsochroneNode(directory) | root = IsochroneNode(directory) | ||||
root.date = provenance.directory_get_date_in_isochrone_frontier(directory) | root.date = provenance.directory_get_date_in_isochrone_frontier(directory) | ||||
stack = [root] | stack = [root] | ||||
while stack: | while stack: | ||||
current = stack.pop() | current = stack.pop() | ||||
assert isinstance(current.entry, DirectoryEntry) | assert isinstance(current.entry, DirectoryEntry) | ||||
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Lines | |||||
def revision_process_content( | def revision_process_content( | ||||
provenance: ProvenanceInterface, revision: RevisionEntry, root: DirectoryEntry | provenance: ProvenanceInterface, revision: RevisionEntry, root: DirectoryEntry | ||||
): | ): | ||||
assert revision.date is not None | assert revision.date is not None | ||||
stack = [(build_isochrone_graph(provenance, revision, root), root.name)] | stack = [(build_isochrone_graph(provenance, revision, root), root.name)] | ||||
while stack: | while stack: | ||||
current, path = stack.pop() | current, path = stack.pop() | ||||
assert isinstance(current.entry, DirectoryEntry) | |||||
if current.date is not None: | if current.date is not None: | ||||
assert current.date < revision.date | assert current.date < revision.date | ||||
# Current directory is an outer isochrone frontier for a previously | # Current directory is an outer isochrone frontier for a previously | ||||
# processed revision. It should be reused as is. | # processed revision. It should be reused as is. | ||||
provenance.directory_add_to_revision(revision, current.entry, path) | provenance.directory_add_to_revision(revision, current.entry, path) | ||||
else: | else: | ||||
# Current directory is not an outer isochrone frontier for any previous | # Current directory is not an outer isochrone frontier for any previous | ||||
# revision. It might be eligible for this one. | # revision. It might be eligible for this one. | ||||
▲ Show 20 Lines • Show All 62 Lines • Show Last 20 Lines |