diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py --- a/swh/provenance/graph.py +++ b/swh/provenance/graph.py @@ -6,7 +6,6 @@ from __future__ import annotations from datetime import datetime, timezone -import logging import os from typing import Any, Dict, Optional, Set @@ -187,9 +186,6 @@ root_date = provenance.directory_get_date_in_isochrone_frontier(directory) root = IsochroneNode(directory, dbdate=root_date) stack = [root] - logging.debug( - f"Recursively creating isochrone graph for revision {revision.id.hex()}..." - ) fdates: Dict[Sha1Git, datetime] = {} # map {file_id: date} while stack: current = stack.pop() @@ -198,12 +194,6 @@ # is greater or equal to the current revision's one, it should be ignored as # the revision is being processed out of order. if current.dbdate is not None and current.dbdate > revision.date: - logging.debug( - f"Invalidating frontier on {current.entry.id.hex()}" - f" (date {current.dbdate})" - f" when processing revision {revision.id.hex()}" - f" (date {revision.date})" - ) current.invalidate() # Pre-query all known dates for directories in the current directory @@ -220,12 +210,8 @@ fdates.update(provenance.content_get_early_dates(current.entry.files)) - logging.debug( - f"Isochrone graph for revision {revision.id.hex()} successfully created!" - ) # Precalculate max known date for each node in the graph (only directory nodes are # pushed to the stack). - logging.debug(f"Computing maxdates for revision {revision.id.hex()}...") stack = [root] while stack: @@ -276,5 +262,4 @@ # node should be treated as unknown current.maxdate = revision.date current.known = False - logging.debug(f"Maxdates for revision {revision.id.hex()} successfully computed!") return root diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -4,8 +4,6 @@ # See top-level LICENSE file for more information from itertools import islice -import logging -import time from typing import Generator, Iterable, Iterator, List, Optional, Tuple from swh.model.model import Sha1Git @@ -49,21 +47,13 @@ archive: ArchiveInterface, origins: List[OriginEntry], ) -> None: - start = time.time() for origin in origins: provenance.origin_add(origin) origin.retrieve_revisions(archive) for revision in origin.revisions: graph = HistoryGraph(archive, provenance, revision) origin_add_revision(provenance, origin, graph) - done = time.time() provenance.flush() - stop = time.time() - logging.debug( - "Origins " - ";".join([origin.id.hex() + ":" + origin.snapshot.hex() for origin in origins]) - + f" were processed in {stop - start} secs (commit took {stop - done} secs)!" - ) def origin_add_revision( diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py --- a/swh/provenance/postgresql/provenance.py +++ b/swh/provenance/postgresql/provenance.py @@ -23,6 +23,8 @@ RevisionData, ) +LOGGER = logging.getLogger(__name__) + class ProvenanceStoragePostgreSql: def __init__( @@ -99,7 +101,7 @@ return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message - logging.exception("Unexpected error") + LOGGER.exception("Unexpected error") if self.raise_on_commit: raise return False @@ -138,7 +140,7 @@ return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message - logging.exception("Unexpected error") + LOGGER.exception("Unexpected error") if self.raise_on_commit: raise return False @@ -211,7 +213,7 @@ return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message - logging.exception("Unexpected error") + LOGGER.exception("Unexpected error") if self.raise_on_commit: raise return False @@ -261,7 +263,7 @@ return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message - logging.exception("Unexpected error") + LOGGER.exception("Unexpected error") if self.raise_on_commit: raise return False diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -20,6 +20,8 @@ ) from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry +LOGGER = logging.getLogger(__name__) + class DatetimeCache(TypedDict): data: Dict[Sha1Git, Optional[datetime]] @@ -79,41 +81,44 @@ # For this layer, relations need to be inserted first so that, in case of # failure, reprocessing the input does not generated an inconsistent database. - while not self.storage.relation_add( - RelationType.CNT_EARLY_IN_REV, - ( - RelationData(src=src, dst=dst, path=path) - for src, dst, path in self.cache["content_in_revision"] - ), - ): - logging.warning( - f"Unable to write {RelationType.CNT_EARLY_IN_REV} rows to the storage. " - f"Data: {self.cache['content_in_revision']}. Retrying..." - ) - - while not self.storage.relation_add( - RelationType.CNT_IN_DIR, - ( - RelationData(src=src, dst=dst, path=path) - for src, dst, path in self.cache["content_in_directory"] - ), - ): - logging.warning( - f"Unable to write {RelationType.CNT_IN_DIR} rows to the storage. " - f"Data: {self.cache['content_in_directory']}. Retrying..." - ) - - while not self.storage.relation_add( - RelationType.DIR_IN_REV, - ( - RelationData(src=src, dst=dst, path=path) - for src, dst, path in self.cache["directory_in_revision"] - ), - ): - logging.warning( - f"Unable to write {RelationType.DIR_IN_REV} rows to the storage. " - f"Data: {self.cache['directory_in_revision']}. Retrying..." - ) + if self.cache["content_in_revision"]: + while not self.storage.relation_add( + RelationType.CNT_EARLY_IN_REV, + ( + RelationData(src=src, dst=dst, path=path) + for src, dst, path in self.cache["content_in_revision"] + ), + ): + LOGGER.warning( + "Unable to write %s rows to the storage. Retrying...", + RelationType.CNT_EARLY_IN_REV, + ) + + if self.cache["content_in_directory"]: + while not self.storage.relation_add( + RelationType.CNT_IN_DIR, + ( + RelationData(src=src, dst=dst, path=path) + for src, dst, path in self.cache["content_in_directory"] + ), + ): + LOGGER.warning( + "Unable to write %s rows to the storage. Retrying...", + RelationType.CNT_IN_DIR, + ) + + if self.cache["directory_in_revision"]: + while not self.storage.relation_add( + RelationType.DIR_IN_REV, + ( + RelationData(src=src, dst=dst, path=path) + for src, dst, path in self.cache["directory_in_revision"] + ), + ): + LOGGER.warning( + "Unable to write %s rows to the storage. Retrying...", + RelationType.DIR_IN_REV, + ) # After relations, dates for the entities can be safely set, acknowledging that # these entities won't need to be reprocessed in case of failure. @@ -122,33 +127,33 @@ for sha1, date in self.cache["content"]["data"].items() if sha1 in self.cache["content"]["added"] and date is not None } - while not self.storage.content_set_date(dates): - logging.warning( - f"Unable to write content dates to the storage. " - f"Data: {dates}. Retrying..." - ) + if dates: + while not self.storage.content_set_date(dates): + LOGGER.warning( + "Unable to write content dates to the storage. Retrying..." + ) dates = { sha1: date for sha1, date in self.cache["directory"]["data"].items() if sha1 in self.cache["directory"]["added"] and date is not None } - while not self.storage.directory_set_date(dates): - logging.warning( - f"Unable to write directory dates to the storage. " - f"Data: {dates}. Retrying..." - ) + if dates: + while not self.storage.directory_set_date(dates): + LOGGER.warning( + "Unable to write directory dates to the storage. Retrying..." + ) dates = { sha1: date for sha1, date in self.cache["revision"]["data"].items() if sha1 in self.cache["revision"]["added"] and date is not None } - while not self.storage.revision_set_date(dates): - logging.warning( - f"Unable to write revision dates to the storage. " - f"Data: {dates}. Retrying..." - ) + if dates: + while not self.storage.revision_set_date(dates): + LOGGER.warning( + "Unable to write revision dates to the storage. Retrying..." + ) # Origin-revision layer insertions ############################################# @@ -159,11 +164,11 @@ for sha1, url in self.cache["origin"]["data"].items() if sha1 in self.cache["origin"]["added"] } - while not self.storage.origin_set_url(urls): - logging.warning( - f"Unable to write origins urls to the storage. " - f"Data: {urls}. Retrying..." - ) + if urls: + while not self.storage.origin_set_url(urls): + LOGGER.warning( + "Unable to write origins urls to the storage. Retrying..." + ) # Second, flat models for revisions' histories (ie. revision-before-revision). data: Iterable[RelationData] = sum( @@ -176,11 +181,12 @@ ], [], ) - while not self.storage.relation_add(RelationType.REV_BEFORE_REV, data): - logging.warning( - f"Unable to write {RelationType.REV_BEFORE_REV} rows to the storage. " - f"Data: {data}. Retrying..." - ) + if data: + while not self.storage.relation_add(RelationType.REV_BEFORE_REV, data): + LOGGER.warning( + "Unable to write %s rows to the storage. Retrying...", + RelationType.REV_BEFORE_REV, + ) # Heads (ie. revision-in-origin entries) should be inserted once flat models for # their histories were already added. This is to guarantee consistent results if @@ -190,11 +196,12 @@ RelationData(src=rev, dst=org, path=None) for rev, org in self.cache["revision_in_origin"] ) - while not self.storage.relation_add(RelationType.REV_IN_ORG, data): - logging.warning( - f"Unable to write {RelationType.REV_IN_ORG} rows to the storage. " - f"Data: {data}. Retrying..." - ) + if data: + while not self.storage.relation_add(RelationType.REV_IN_ORG, data): + LOGGER.warning( + "Unable to write %s rows to the storage. Retrying...", + RelationType.REV_IN_ORG, + ) # Finally, preferred origins for the visited revisions are set (this step can be # reordered if required). @@ -202,11 +209,11 @@ sha1: self.cache["revision_origin"]["data"][sha1] for sha1 in self.cache["revision_origin"]["added"] } - while not self.storage.revision_set_origin(origins): - logging.warning( - f"Unable to write preferred origins to the storage. " - f"Data: {origins}. Retrying..." - ) + if origins: + while not self.storage.revision_set_origin(origins): + LOGGER.warning( + "Unable to write preferred origins to the storage. Retrying..." + ) # clear local cache ############################################################ self.clear_caches() diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py --- a/swh/provenance/revision.py +++ b/swh/provenance/revision.py @@ -4,9 +4,7 @@ # See top-level LICENSE file for more information from datetime import datetime, timezone -import logging import os -import time from typing import Generator, Iterable, Iterator, List, Optional, Tuple from swh.model.model import Sha1Git @@ -59,17 +57,12 @@ mindepth: int = 1, commit: bool = True, ) -> None: - start = time.time() for revision in revisions: assert revision.date is not None assert revision.root is not None # Processed content starting from the revision's root directory. date = provenance.revision_get_date(revision) if date is None or revision.date < date: - logging.debug( - f"Processing revisions {revision.id.hex()}" - f" (known date {date} / revision date {revision.date})..." - ) graph = build_isochrone_graph( archive, provenance, @@ -86,14 +79,8 @@ lower=lower, mindepth=mindepth, ) - done = time.time() if commit: provenance.flush() - stop = time.time() - logging.debug( - f"Revisions {';'.join([revision.id.hex() for revision in revisions])} " - f" were processed in {stop - start} secs (commit took {stop - done} secs)!" - ) def revision_process_content(