diff --git a/swh/provenance/model.py b/swh/provenance/model.py index a04d8e8..37ff247 100644 --- a/swh/provenance/model.py +++ b/swh/provenance/model.py @@ -1,49 +1,36 @@ -import os - from .archive import ArchiveInterface -from pathlib import PosixPath - - -# class Tree: -# def __init__(self, archive: ArchiveInterface, id: bytes): -# self.root = DirectoryEntry(archive, id, PosixPath('.')) - class TreeEntry: - def __init__(self, id: bytes, name: PosixPath): + def __init__(self, id: bytes, name: bytes): self.id = id self.name = name class DirectoryEntry(TreeEntry): - def __init__(self, archive: ArchiveInterface, id: bytes, name: PosixPath): + def __init__(self, archive: ArchiveInterface, id: bytes, name: bytes): super().__init__(id, name) self.archive = archive self.children = None def __iter__(self): if self.children is None: self.children = [] for child in self.archive.directory_ls(self.id): if child["type"] == "dir": self.children.append( DirectoryEntry( self.archive, child["target"], - PosixPath(os.fsdecode(child["name"])), + child["name"], ) ) elif child["type"] == "file": - self.children.append( - FileEntry( - child["target"], PosixPath(os.fsdecode(child["name"])) - ) - ) + self.children.append(FileEntry(child["target"], child["name"])) return iter(self.children) class FileEntry(TreeEntry): pass diff --git a/swh/provenance/postgresql/db_utils.py b/swh/provenance/postgresql/db_utils.py index 483f6b1..8e406f3 100644 --- a/swh/provenance/postgresql/db_utils.py +++ b/swh/provenance/postgresql/db_utils.py @@ -1,62 +1,61 @@ import io import psycopg2 from configparser import ConfigParser -from pathlib import PosixPath -def config(filename: PosixPath, section: str): +def config(filename: str, section: str): # create a parser parser = ConfigParser() # read config file parser.read(filename) # get section, default to postgresql db = {} if parser.has_section(section): params = parser.items(section) for param in params: db[param[0]] = param[1] else: raise Exception(f"Section {section} not found in the {filename} file") return db def typecast_bytea(value, cur): if value is not None: data = psycopg2.BINARY(value, cur) return data.tobytes() def adapt_conn(conn): """Makes psycopg2 use 'bytes' to decode bytea instead of 'memoryview', for this connection.""" t_bytes = psycopg2.extensions.new_type((17,), "bytea", typecast_bytea) psycopg2.extensions.register_type(t_bytes, conn) t_bytes_array = psycopg2.extensions.new_array_type((1001,), "bytea[]", t_bytes) psycopg2.extensions.register_type(t_bytes_array, conn) def connect(params: dict): """ Connect to the PostgreSQL database server """ conn = None try: # connect to the PostgreSQL server conn = psycopg2.connect(**params) adapt_conn(conn) except (Exception, psycopg2.DatabaseError) as error: print(error) return conn -def execute_sql(conn: psycopg2.extensions.connection, filename: PosixPath): +def execute_sql(conn: psycopg2.extensions.connection, filename: str): with io.open(filename) as file: cur = conn.cursor() cur.execute(file.read()) cur.close() conn.commit() diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py index 2bcee4c..1c63dda 100644 --- a/swh/provenance/postgresql/provenance.py +++ b/swh/provenance/postgresql/provenance.py @@ -1,409 +1,406 @@ import itertools import logging import os import psycopg2 import psycopg2.extras from ..model import DirectoryEntry, FileEntry from ..origin import OriginEntry from .db_utils import connect, execute_sql from ..provenance import ProvenanceInterface from ..revision 
import RevisionEntry from datetime import datetime -from pathlib import PosixPath from typing import Any, Dict, List, Optional -def normalize(path: PosixPath) -> PosixPath: - spath = str(path) - if spath.startswith("./"): - return PosixPath(spath[2:]) - return path +def normalize(path: bytes) -> bytes: + return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) # Normalize dbname to avoid issues when reconnecting below name = name.casefold() # Create new database dropping previous one if exists cursor = conn.cursor() cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") cursor.execute(f"""CREATE DATABASE {name}""") conn.close() # Reconnect to server selecting newly created database to add tables conninfo["dbname"] = name conn = connect(conninfo) sqldir = os.path.dirname(os.path.realpath(__file__)) - execute_sql(conn, PosixPath(os.path.join(sqldir, "provenance.sql"))) + execute_sql(conn, os.path.join(sqldir, "provenance.sql")) -################################################################################ -################################################################################ -################################################################################ +######################################################################################## +######################################################################################## +######################################################################################## class ProvenancePostgreSQL(ProvenanceInterface): def __init__(self, conn: psycopg2.extensions.connection): # TODO: consider adding a mutex for thread safety conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) self.conn = conn self.cursor = self.conn.cursor() self.insert_cache: Dict[str, Any] = {} self.remove_cache: Dict[str, Any] = {} self.select_cache: Dict[str, Any] = {} self.clear_caches() def clear_caches(self): self.insert_cache = { "content": dict(), "content_early_in_rev": list(), "content_in_dir": list(), "directory": dict(), "directory_in_rev": list(), "revision": dict(), "revision_before_rev": list(), "revision_in_org": list(), } self.remove_cache = {"directory": dict()} self.select_cache = {"content": dict(), "directory": dict(), "revision": dict()} def commit(self): result = False try: self.insert_all() # self.conn.commit() result = True # except psycopg2.DatabaseError: # # Database error occurred, rollback all changes # self.conn.rollback() # # TODO: maybe serialize and auto-merge transations. # # The only conflicts are on: # # - content: we keep the earliest date # # - directory: we keep the earliest date # # - content_in_dir: there should be just duplicated entries. 
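As a side note on the path handling above: the bytes-based normalize() defined near the top of this file is meant to keep the behaviour of the old PosixPath version. A minimal standalone sketch (POSIX os.path.sep assumed, sample paths are illustrative):

import os


def normalize(path: bytes) -> bytes:
    # Strip a leading b"./" exactly like the PosixPath-based version did.
    return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path


assert normalize(b"./src/main.py") == b"src/main.py"  # leading "./" is dropped
assert normalize(b"src/main.py") == b"src/main.py"    # already-normalized paths pass through
assert normalize(b".") == b"."                         # bare "." is left untouched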
except Exception as error: # Unexpected error occurred, rollback all changes and log message logging.error(f"Unexpected error: {error}") # self.conn.rollback() finally: self.clear_caches() return result def content_add_to_directory( - self, directory: DirectoryEntry, blob: FileEntry, prefix: PosixPath + self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_in_dir"].append( - (blob.id, directory.id, bytes(normalize(prefix / blob.name))) + (blob.id, directory.id, normalize(os.path.join(prefix, blob.name))) ) def content_add_to_revision( - self, revision: RevisionEntry, blob: FileEntry, prefix: PosixPath + self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_early_in_rev"].append( - (blob.id, revision.id, bytes(normalize(prefix / blob.name))) + (blob.id, revision.id, normalize(os.path.join(prefix, blob.name))) ) - def content_find_first(self, blobid: str): + def content_find_first(self, blobid: bytes): self.cursor.execute( """SELECT blob, rev, date, path FROM content_early_in_rev JOIN revision ON revision.id=content_early_in_rev.rev WHERE content_early_in_rev.blob=%s ORDER BY date, rev, path ASC LIMIT 1""", (blobid,), ) return self.cursor.fetchone() - def content_find_all(self, blobid: str): + def content_find_all(self, blobid: bytes): self.cursor.execute( """(SELECT blob, rev, date, path FROM content_early_in_rev JOIN revision ON revision.id=content_early_in_rev.rev WHERE content_early_in_rev.blob=%s) UNION (SELECT content_in_rev.blob, content_in_rev.rev, revision.date, content_in_rev.path FROM (SELECT content_in_dir.blob, directory_in_rev.rev, CASE directory_in_rev.path + WHEN '' THEN content_in_dir.path WHEN '.' THEN content_in_dir.path ELSE (directory_in_rev.path || '/' || content_in_dir.path)::unix_path END AS path FROM content_in_dir JOIN directory_in_rev ON content_in_dir.dir=directory_in_rev.dir WHERE content_in_dir.blob=%s ) AS content_in_rev JOIN revision ON revision.id=content_in_rev.rev ) ORDER BY date, rev, path""", (blobid, blobid), ) # POSTGRESQL EXPLAIN yield from self.cursor.fetchall() def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: # First check if the date is being modified by current transection. date = self.insert_cache["content"].get(blob.id, None) if date is None: # If not, check whether it's been query before date = self.select_cache["content"].get(blob.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM content WHERE id=%s""", (blob.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["content"][blob.id] = date return date def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: dates = {} pending = [] for blob in blobs: # First check if the date is being modified by current transection. 
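The single-blob lookup above follows the three-tier pattern used throughout this class: pending writes first, then the read cache, then the database. A generic standalone sketch of that pattern (cached_lookup and fetch_from_db are illustrative names, not part of the diff):

from typing import Callable, Dict, Optional


def cached_lookup(
    key: bytes,
    insert_cache: Dict[bytes, Optional[object]],
    select_cache: Dict[bytes, Optional[object]],
    fetch_from_db: Callable[[bytes], Optional[object]],
) -> Optional[object]:
    # Pending writes take precedence over anything previously read.
    value = insert_cache.get(key)
    if value is None:
        # Fall back to values already read during this session.
        value = select_cache.get(key)
        if value is None:
            # Finally hit the database and remember the answer.
            value = fetch_from_db(key)
            select_cache[key] = value
    return value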
date = self.insert_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: # If not, check whether it's been query before date = self.select_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: pending.append(blob.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT id, date FROM content WHERE id IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["content"][row[0]] = row[1] return dates def content_set_early_date(self, blob: FileEntry, date: datetime): self.insert_cache["content"][blob.id] = date def directory_add_to_revision( - self, revision: RevisionEntry, directory: DirectoryEntry, path: PosixPath + self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ): self.insert_cache["directory_in_rev"].append( - (directory.id, revision.id, bytes(normalize(path))) + (directory.id, revision.id, normalize(path)) ) def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: # First check if the date is being modified by current transection. date = self.insert_cache["directory"].get(directory.id, None) if date is None and directory.id not in self.remove_cache["directory"]: # If not, check whether it's been query before date = self.select_cache["directory"].get(directory.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM directory WHERE id=%s""", (directory.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["directory"][directory.id] = date return date def directory_get_dates_in_isochrone_frontier( self, dirs: List[DirectoryEntry] ) -> Dict[bytes, datetime]: dates = {} pending = [] for directory in dirs: # First check if the date is being modified by current transection. 
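The batched variant above builds its IN clause from repeated placeholders. A small sketch of just that string construction (the blob ids are made up; the execute call is left commented since it needs a live cursor):

import itertools

pending = [b"blob-1", b"blob-2", b"blob-3"]
values = ", ".join(itertools.repeat("%s", len(pending)))
query = f"SELECT id, date FROM content WHERE id IN ({values})"
assert query == "SELECT id, date FROM content WHERE id IN (%s, %s, %s)"
# cursor.execute(query, tuple(pending))  # run against the real cursor in the diff above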
date = self.insert_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date elif directory.id not in self.remove_cache["directory"]: # If not, check whether it's been query before date = self.select_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date else: pending.append(directory.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT id, date FROM directory WHERE id IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["directory"][row[0]] = row[1] return dates def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry): self.remove_cache["directory"][directory.id] = None self.insert_cache["directory"].pop(directory.id, None) def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ): self.insert_cache["directory"][directory.id] = date self.remove_cache["directory"].pop(directory.id, None) def insert_all(self): # Performe insertions with cached information if self.insert_cache["content"]: psycopg2.extras.execute_values( self.cursor, """INSERT INTO content(id, date) VALUES %s ON CONFLICT (id) DO UPDATE SET date=LEAST(EXCLUDED.date,content.date)""", self.insert_cache["content"].items(), ) if self.insert_cache["content_early_in_rev"]: psycopg2.extras.execute_values( self.cursor, """INSERT INTO content_early_in_rev VALUES %s ON CONFLICT DO NOTHING""", self.insert_cache["content_early_in_rev"], ) if self.insert_cache["content_in_dir"]: psycopg2.extras.execute_values( self.cursor, """INSERT INTO content_in_dir VALUES %s ON CONFLICT DO NOTHING""", self.insert_cache["content_in_dir"], ) if self.insert_cache["directory"]: psycopg2.extras.execute_values( self.cursor, """INSERT INTO directory(id, date) VALUES %s ON CONFLICT (id) DO UPDATE SET date=LEAST(EXCLUDED.date,directory.date)""", self.insert_cache["directory"].items(), ) if self.insert_cache["directory_in_rev"]: psycopg2.extras.execute_values( self.cursor, """INSERT INTO directory_in_rev VALUES %s ON CONFLICT DO NOTHING""", self.insert_cache["directory_in_rev"], ) if self.insert_cache["revision"]: psycopg2.extras.execute_values( self.cursor, """INSERT INTO revision(id, date) VALUES %s ON CONFLICT (id) DO UPDATE SET date=LEAST(EXCLUDED.date,revision.date)""", self.insert_cache["revision"].items(), ) if self.insert_cache["revision_before_rev"]: psycopg2.extras.execute_values( self.cursor, """INSERT INTO revision_before_rev VALUES %s ON CONFLICT DO NOTHING""", self.insert_cache["revision_before_rev"], ) if self.insert_cache["revision_in_org"]: psycopg2.extras.execute_values( self.cursor, """INSERT INTO revision_in_org VALUES %s ON CONFLICT DO NOTHING""", self.insert_cache["revision_in_org"], ) def origin_get_id(self, origin: OriginEntry) -> int: if origin.id is None: # Check if current origin is already known and retrieve its internal id. self.cursor.execute("""SELECT id FROM origin WHERE url=%s""", (origin.url,)) row = self.cursor.fetchone() if row is None: # If the origin is seen for the first time, current revision is # the prefered one. 
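insert_all above relies on ON CONFLICT ... DO UPDATE SET date=LEAST(EXCLUDED.date, ...) so that re-inserting an id can only move its date to an earlier value. A toy Python restatement of that rule (the actual work is done by PostgreSQL; names here are illustrative):

from datetime import datetime


def upsert_earliest(table: dict, row_id: bytes, date: datetime) -> None:
    # Mirrors ON CONFLICT (id) DO UPDATE SET date=LEAST(EXCLUDED.date, content.date).
    current = table.get(row_id)
    table[row_id] = date if current is None else min(current, date)


table: dict = {}
upsert_earliest(table, b"c1", datetime(2021, 2, 1))
upsert_earliest(table, b"c1", datetime(2020, 5, 1))
assert table[b"c1"] == datetime(2020, 5, 1)  # the earliest date wins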
self.cursor.execute( """INSERT INTO origin (url) VALUES (%s) RETURNING id""", (origin.url,), ) return self.cursor.fetchone()[0] else: return row[0] else: return origin.id def revision_add(self, revision: RevisionEntry): # Add current revision to the compact DB self.insert_cache["revision"][revision.id] = revision.date def revision_add_before_revision( self, relative: RevisionEntry, revision: RevisionEntry ): self.insert_cache["revision_before_rev"].append((revision.id, relative.id)) def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): self.insert_cache["revision_in_org"].append((revision.id, origin.id)) def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: date = self.insert_cache["revision"].get(revision.id, None) if date is None: # If not, check whether it's been query before date = self.select_cache["revision"].get(revision.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM revision WHERE id=%s""", (revision.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["revision"][revision.id] = date return date def revision_get_prefered_origin(self, revision: RevisionEntry) -> int: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT COALESCE(org,0) FROM revision WHERE id=%s""", (revision.id,) ) row = self.cursor.fetchone() # None means revision is not in database # 0 means revision has no prefered origin return row[0] if row is not None and row[0] != 0 else None def revision_in_history(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_before_rev WHERE prev=%s""", (revision.id,) ) return self.cursor.fetchone() is not None def revision_set_prefered_origin( self, origin: OriginEntry, revision: RevisionEntry ): # TODO: adapt this method to consider cached values self.cursor.execute( """UPDATE revision SET org=%s WHERE id=%s""", (origin.id, revision.id) ) def revision_visited(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_in_org WHERE rev=%s""", (revision.id,) ) return self.cursor.fetchone() is not None diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py index d913d23..43e4447 100644 --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -1,425 +1,425 @@ +import os + from .archive import ArchiveInterface from .model import DirectoryEntry, FileEntry from .origin import OriginEntry from .revision import RevisionEntry from datetime import datetime -from pathlib import PosixPath from typing import Dict, List, Optional, Tuple # TODO: consider moving to path utils file together with normalize. -def is_child(path: PosixPath, prefix: PosixPath) -> bool: - # PosixPath returns '.' as parent when there is not upper directory. First check - # avoids considering current directory its own parent. 
- return path.parent != path and path.parent == prefix +def is_child(path: bytes, prefix: bytes) -> bool: + return path != prefix and os.path.dirname(path) == prefix + + +# def is_not_prefix(prefix: bytes, path: bytes) -> bool: +# return not path.startswith(prefix) class ProvenanceInterface: def __init__(self, **kwargs): raise NotImplementedError def commit(self): raise NotImplementedError def content_add_to_directory( - self, directory: DirectoryEntry, blob: FileEntry, prefix: PosixPath + self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ): raise NotImplementedError def content_add_to_revision( - self, revision: RevisionEntry, blob: FileEntry, prefix: PosixPath + self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ): raise NotImplementedError - def content_find_first(self, blobid: str): + def content_find_first(self, blobid: bytes): raise NotImplementedError - def content_find_all(self, blobid: str): + def content_find_all(self, blobid: bytes): raise NotImplementedError def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: raise NotImplementedError def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: raise NotImplementedError def content_set_early_date(self, blob: FileEntry, date: datetime): raise NotImplementedError def directory_add_to_revision( - self, revision: RevisionEntry, directory: DirectoryEntry, path: PosixPath + self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ): raise NotImplementedError def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: raise NotImplementedError def directory_get_dates_in_isochrone_frontier( self, dirs: List[DirectoryEntry] ) -> Dict[bytes, datetime]: raise NotImplementedError def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry): raise NotImplementedError def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ): raise NotImplementedError def origin_get_id(self, origin: OriginEntry) -> int: raise NotImplementedError def revision_add(self, revision: RevisionEntry): raise NotImplementedError def revision_add_before_revision( self, relative: RevisionEntry, revision: RevisionEntry ): raise NotImplementedError def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): raise NotImplementedError def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: raise NotImplementedError def revision_get_prefered_origin(self, revision: RevisionEntry) -> int: raise NotImplementedError def revision_in_history(self, revision: RevisionEntry) -> bool: raise NotImplementedError def revision_set_prefered_origin( self, origin: OriginEntry, revision: RevisionEntry ): raise NotImplementedError def revision_visited(self, revision: RevisionEntry) -> bool: raise NotImplementedError def directory_process_content( provenance: ProvenanceInterface, directory: DirectoryEntry, - relative: DirectoryEntry, - prefix: PosixPath, + relative: DirectoryEntry ): - stack = [(directory, prefix)] + stack = [(directory, b"")] while stack: - current, path = stack.pop() + current, prefix = stack.pop() for child in iter(current): if isinstance(child, FileEntry): - # Add content to the relative directory with the computed path. - provenance.content_add_to_directory(relative, child, path) + # Add content to the relative directory with the computed prefix. 
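A quick check of the bytes-based is_child introduced above (standalone copy of the function; POSIX-style separators assumed):

import os


def is_child(path: bytes, prefix: bytes) -> bool:
    # A path is a direct child of prefix when its dirname is exactly prefix
    # and it is not the prefix itself.
    return path != prefix and os.path.dirname(path) == prefix


assert is_child(b"a/b", b"a")        # direct child
assert not is_child(b"a/b/c", b"a")  # grandchild, not a direct child
assert not is_child(b"a", b"a")      # a path is not its own child
assert is_child(b"a", b"")           # top-level entries hang off the empty prefix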
+ provenance.content_add_to_directory(relative, child, prefix) else: # Recursively walk the child directory. - stack.append((child, path / child.name)) + stack.append((child, os.path.join(prefix, child.name))) def directory_update_content( - stack: List[Tuple[DirectoryEntry, PosixPath]], + stack: List[Tuple[DirectoryEntry, bytes]], provenance: ProvenanceInterface, revision: RevisionEntry, directory: DirectoryEntry, - path: PosixPath, + prefix: bytes, subdirs: Optional[List[DirectoryEntry]] = None, blobs: Optional[List[FileEntry]] = None, blobdates: Optional[Dict[bytes, datetime]] = None, ): assert revision.date is not None # Init optional parameters if not provided. if subdirs is None: subdirs = [child for child in directory if isinstance(child, DirectoryEntry)] if blobs is None: blobs = [child for child in directory if isinstance(child, FileEntry)] if blobdates is None: blobdates = provenance.content_get_early_dates(blobs) # Iterate over blobs updating their date if necessary. for blob in blobs: date = blobdates.get(blob.id, None) if date is None or revision.date < date: provenance.content_set_early_date(blob, revision.date) - # provenance.content_add_to_revision(revision, blob, path) # Push all subdirectories with its corresponding path to analyze them # recursively. for subdir in subdirs: - stack.append((subdir, path / subdir.name)) + stack.append((subdir, os.path.join(prefix, subdir.name))) def origin_add(provenance: ProvenanceInterface, origin: OriginEntry): # TODO: refactor to iterate over origin visit statuses and commit only once # per status. origin.id = provenance.origin_get_id(origin) for revision in origin.revisions: origin_add_revision(provenance, origin, revision) # Commit after each revision provenance.commit() # TODO: verify this! def origin_add_revision( provenance: ProvenanceInterface, origin: OriginEntry, revision: RevisionEntry ): stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)] while stack: relative, current = stack.pop() # Check if current revision has no prefered origin and update if necessary. prefered = provenance.revision_get_prefered_origin(current) if prefered is None: provenance.revision_set_prefered_origin(origin, current) ######################################################################## if relative is None: # This revision is pointed directly by the origin. visited = provenance.revision_visited(current) provenance.revision_add_to_origin(origin, current) if not visited: stack.append((current, current)) else: # This revision is a parent of another one in the history of the # relative revision. for parent in iter(current): visited = provenance.revision_visited(parent) if not visited: # The parent revision has never been seen before pointing # directly to an origin. known = provenance.revision_in_history(parent) if known: # The parent revision is already known in some other # revision's history. We should point it directly to # the origin and (eventually) walk its history. stack.append((None, parent)) else: # The parent revision was never seen before. We should # walk its history and associate it with the same # relative revision. provenance.revision_add_before_revision(relative, parent) stack.append((relative, parent)) else: # The parent revision already points to an origin, so its # history was properly processed before. We just need to # make sure it points to the current origin as well. 
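directory_process_content above replaces path objects with an explicit stack of (directory, bytes prefix) pairs built via os.path.join. A toy walk over plain nested dicts showing the same traversal shape (TOY_TREE and its layout are illustrative only):

import os

# Leaves are blob ids (bytes); inner nodes are dicts keyed by entry name (bytes).
TOY_TREE = {b"README": b"blob-1", b"src": {b"main.py": b"blob-2"}}


def walk(tree: dict):
    stack = [(tree, b"")]
    while stack:
        current, prefix = stack.pop()
        for name, child in current.items():
            if isinstance(child, dict):
                # Push subdirectories with their extended bytes prefix.
                stack.append((child, os.path.join(prefix, name)))
            else:
                # Report each blob together with the bytes path it was found under.
                yield child, os.path.join(prefix, name)


assert dict(walk(TOY_TREE)) == {b"blob-1": b"README", b"blob-2": b"src/main.py"}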
provenance.revision_add_to_origin(origin, parent) def revision_add( provenance: ProvenanceInterface, archive: ArchiveInterface, revision: RevisionEntry ): assert revision.date is not None assert revision.root is not None # Processed content starting from the revision's root directory date = provenance.revision_get_early_date(revision) if date is None or revision.date < date: provenance.revision_add(revision) revision_process_content( - provenance, revision, DirectoryEntry(archive, revision.root, PosixPath(".")) + provenance, revision, DirectoryEntry(archive, revision.root, b"") ) return provenance.commit() def revision_process_content( provenance: ProvenanceInterface, revision: RevisionEntry, root: DirectoryEntry ): assert revision.date is not None # Stack of directories (and their paths) to be processed. - stack: List[Tuple[DirectoryEntry, PosixPath]] = [(root, root.name)] + stack: List[Tuple[DirectoryEntry, bytes]] = [(root, root.name)] # This dictionary will hold the computed dates for visited subdirectories inside the # isochrone frontier. - innerdirs: Dict[PosixPath, Tuple[DirectoryEntry, datetime]] = {} + innerdirs: Dict[bytes, Tuple[DirectoryEntry, datetime]] = {} # This dictionary will hold the computed dates for visited subdirectories outside # the isochrone frontier which are candidates to be added to the outer frontier (if # their parent is in the inner frontier). - outerdirs: Dict[PosixPath, Tuple[DirectoryEntry, datetime]] = {} + outerdirs: Dict[bytes, Tuple[DirectoryEntry, datetime]] = {} while stack: # Get next directory to process and query its date right before processing to be # sure we get the most recently updated value. current, prefix = stack.pop() date = provenance.directory_get_date_in_isochrone_frontier(current) if date is None: # The directory has never been seen on the outer isochrone frontier of # previously processed revisions. Its children should be analyzed. blobs = [child for child in current if isinstance(child, FileEntry)] subdirs = [child for child in current if isinstance(child, DirectoryEntry)] # Get the list of ids with no duplicates to ensure we have available dates # for all the elements. This prevents taking a wrong decision when a blob # occurs more than once in the same directory. ids = list(dict.fromkeys([child.id for child in blobs + subdirs])) if ids: # Known dates for the blobs in the current directory. blobdates = provenance.content_get_early_dates(blobs) # Known dates for the subdirectories in the current directory that # belong to the outer isochrone frontier of some previously processed # revision. knowndates = provenance.directory_get_dates_in_isochrone_frontier( subdirs ) # Known dates for the subdirectories in the current directory that are # inside the isochrone frontier of the revision. innerdates = { innerdir.id: innerdate for path, (innerdir, innerdate) in innerdirs.items() if is_child(path, prefix) } # Known dates for the subdirectories in the current directory that are # outside the isochrone frontier of the revision. outerdates = { outerdir.id: outerdate for path, (outerdir, outerdate) in outerdirs.items() if is_child(path, prefix) } # All known dates for child nodes of the current directory. assert not (innerdates.keys() & outerdates.keys()) dates = list( {**blobdates, **knowndates, **innerdates, **outerdates}.values() ) if len(dates) == len(ids): # All child nodes of current directory are already known. maxdate = max(dates) if maxdate < revision.date: # The directory is outside the isochrone frontier of the # revision. 
It is a candidate to be added to the outer frontier. outerdirs[prefix] = (current, maxdate) # Its children are removed since they are no longer candidates. outerdirs = { path: outerdir for path, outerdir in outerdirs.items() if not is_child(path, prefix) } elif maxdate == revision.date: # The current directory is inside the isochrone frontier. innerdirs[prefix] = (current, revision.date) # Add blobs present in this level to the revision. No need to # update dates as they are at most equal to current one. for blob in blobs: provenance.content_add_to_revision(revision, blob, prefix) # If any of its children was found outside the frontier it # should be added to the outer frontier now. if outerdates: for path, (outerdir, outerdate) in outerdirs.items(): if is_child(path, prefix): provenance.directory_set_date_in_isochrone_frontier( outerdir, outerdate ) provenance.directory_add_to_revision( revision, outerdir, path ) directory_process_content( provenance, directory=outerdir, - relative=outerdir, - prefix=PosixPath("."), + relative=outerdir ) # Removed processed elements to avoid duplicating work. outerdirs = { path: outerdir for path, outerdir in outerdirs.items() if not is_child(path, prefix) } # There can still be subdirectories that are known to be in the # outter isochrone frontier of previous processed revisions. # Thus, they are not in the list of candidates but have to be # added to current revisions as well. for subdir in subdirs: knowndate = knowndates.get(subdir.id, None) if knowndate is not None and knowndate <= revision.date: # Less or equal since the directory could have been # added to the outer isochrone frontier when processing # a different directory's subtree of this very same # revision. provenance.directory_add_to_revision( - revision, subdir, prefix / subdir.name + revision, subdir, os.path.join(prefix, subdir.name) ) else: # The revision is out of order. The current directory does not # belong to the outer isochrone frontier of any previously # processed revision yet all its children nodes are known. They # should be re-analyzed (and timestamps eventually updated) and # current directory updated after them. stack.append((current, prefix)) directory_update_content( stack, provenance, revision, current, prefix, subdirs=subdirs, blobs=blobs, blobdates=blobdates, ) else: # Al least one child node is unknown, ie. the current directory is # inside the isochrone frontier of the current revision. Its child # nodes should be analyzed and current directory updated after them. stack.append((current, prefix)) directory_update_content( stack, provenance, revision, current, prefix, subdirs=subdirs, blobs=blobs, blobdates=blobdates, ) else: # Empty directory. Just consider it to be in the inner frontier of # current revision (ie. all its children are already "known"). innerdirs[prefix] = (current, revision.date) elif revision.date < date: # The revision is out of order. The current directory belongs to the outer # isochrone frontier of some previously processed revison but current # revision is earlier. The frontier record should be invalidated, children # nodes re-analyzed (and timestamps eventually updated) and current # directory updated after them. stack.append((current, prefix)) provenance.directory_invalidate_in_isochrone_frontier(current) directory_update_content(stack, provenance, revision, current, prefix) else: # The directory has already been seen on the outer isochrone frontier of an # earlier revision. Just add it to the current revision. 
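When all child dates are known, the frontier decision above reduces to comparing their maximum with the revision date. A compact restatement of that comparison (names are illustrative; the real code also handles missing dates and out-of-order revisions):

from datetime import datetime
from typing import List


def classify(child_dates: List[datetime], revision_date: datetime) -> str:
    # Only meaningful when every child of the directory already has a known date.
    maxdate = max(child_dates)
    if maxdate < revision_date:
        return "outer frontier candidate"  # all children strictly predate the revision
    if maxdate == revision_date:
        return "inner frontier"            # some child is dated by this very revision
    return "out of order"                  # a child is newer: revisions arrived out of order


assert classify([datetime(2020, 1, 1)], datetime(2021, 1, 1)) == "outer frontier candidate"
assert classify([datetime(2021, 1, 1)], datetime(2021, 1, 1)) == "inner frontier"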
             provenance.directory_add_to_revision(revision, current, prefix)
 
     if outerdirs:
         # This should only happen if the root directory is in the outer frontier.
+        if not (len(outerdirs) == 1 and root.name in outerdirs):
+            print(outerdirs)
         assert len(outerdirs) == 1 and root.name in outerdirs
         outerdir, outerdate = outerdirs[root.name]
         provenance.directory_set_date_in_isochrone_frontier(outerdir, outerdate)
         provenance.directory_add_to_revision(revision, outerdir, root.name)
-        directory_process_content(
-            provenance, directory=outerdir, relative=outerdir, prefix=PosixPath(".")
-        )
+        directory_process_content(provenance, directory=outerdir, relative=outerdir)
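With revision_add now rooting the walk at DirectoryEntry(archive, revision.root, b"") instead of PosixPath("."), the root prefix is the empty byte string and joined paths come out relative from the start. A quick illustration (POSIX separators assumed):

import os

root_name = b""  # the root DirectoryEntry is now created with name b""
assert os.path.join(root_name, b"src", b"main.py") == b"src/main.py"
# No b"./" prefix is introduced along the way; normalize() above only has to
# strip a literal b"./" if one ever slips in.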