Differential D5925 Diff 21456 swh/provenance/postgresql/archive.py

Changeset View

Standalone View

swh/provenance/postgresql/archive.py

from typing import Any, Dict, Iterable, List, Set from typing import Any, Dict, Iterable, List, Set

from methodtools import lru_cache from methodtools import lru_cache

import psycopg2 import psycopg2

from swh.model.model import ObjectType, Revision, Sha1Git, TargetType from swh.model.model import ObjectType, Sha1Git, TargetType

from swh.storage.postgresql.storage import Storage from swh.storage.postgresql.storage import Storage

class ArchivePostgreSQL: class ArchivePostgreSQL:

def __init__(self, conn: psycopg2.extensions.connection): def __init__(self, conn: psycopg2.extensions.connection):

self.conn = conn self.conn = conn

self.storage = Storage(conn, objstorage={"cls": "memory"}) self.storage = Storage(conn, objstorage={"cls": "memory"})

Show All 33 Lines def directory_ls_internal(self, id: Sha1Git) -> List[Dict[str, Any]]:

LEFT JOIN directory_entry_file e ON ls_f.entry_id=e.id LEFT JOIN directory_entry_file e ON ls_f.entry_id=e.id

LEFT JOIN skipped_content c ON e.target=c.sha1_git LEFT JOIN skipped_content c ON e.target=c.sha1_git

WHERE NOT EXISTS ( WHERE NOT EXISTS (

SELECT 1 FROM known_contents SELECT 1 FROM known_contents

WHERE known_contents.sha1_git=e.target WHERE known_contents.sha1_git=e.target

) )

ORDER BY name ORDER BY name

douarddaUnsubmitted

Done

why is this removed?

douardda: why is this removed?

aevisoAuthorUnsubmitted

Done

there's no need to guaranty any order when iterating the contents of a directory

aeviso: there's no need to guaranty any order when iterating the contents of a directory

""", """,

(id,), (id,),

) )

return [ return [

{"type": row[0], "target": row[1], "name": row[2]} {"type": row[0], "target": row[1], "name": row[2]}

for row in cursor.fetchall() for row in cursor.fetchall()

] ]

def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]: def revision_get_parents(self, id: Sha1Git) -> Iterable[Sha1Git]:

with self.conn.cursor() as cursor: with self.conn.cursor() as cursor:

psycopg2.extras.execute_values( cursor.execute(

cursor,

""" """

SELECT t.id, revision.date, revision.directory, SELECT RH.parent_id::bytea

ARRAY( FROM revision_history AS RH

SELECT rh.parent_id::bytea WHERE RH.id=%s

FROM revision_history rh ORDER BY RH.parent_rank

douarddaUnsubmitted

Done

since you do only return elements from this array for one revision only, I don't think you need to recreate the ARRAY structure.

douardda: since you do only return elements from this array for one revision only, I don't think you need…

douarddaUnsubmitted

Done

cursor.execute(

"""

- SELECT ARRAY(

- SELECT RH.parent_id::bytea

- FROM revision_history AS RH

- WHERE RH.id=R.id

- ORDER BY RH.parent_rank

- )

- FROM revision AS R WHERE R.id=%s

+ SELECT RH.parent_id::bytea

+ FROM revision_history AS RH

+ WHERE RH.id=%s

+ ORDER BY RH.parent_rank

""",

since you do only return elements from this array for one revision only, I don't think you need to recreate the ARRAY structure.

If using suggested SQL code, then just yield from returned rows.

douardda: since you do only return elements from this array for one revision only, I don't think you need…

aevisoAuthorUnsubmitted

Done

I agree the query can be further simplified, but wouldn't row be a tuple with a single Sha1Git element? I guess I still need to yield row[0]

aeviso: I agree the query can be further simplified, but wouldn't `row` be a tuple with a single…

douarddaUnsubmitted

Done

yes but the idea is to prevent sending unused data on the wire (pg connection).

douardda: yes but the idea is to prevent sending unused data on the wire (pg connection).

WHERE rh.id = t.id

ORDER BY rh.parent_rank

)

FROM (VALUES %s) as t(sortkey, id)

LEFT JOIN revision ON t.id = revision.id

LEFT JOIN person author ON revision.author = author.id

LEFT JOIN person committer ON revision.committer = committer.id

ORDER BY sortkey

""", """,

((sortkey, id) for sortkey, id in enumerate(ids)), (id,),

)

for row in cursor.fetchall():

parents = []

for parent in row[3]:

if parent:

parents.append(parent)

yield Revision.from_dict(

{

"id": row[0],

"date": row[1],

"directory": row[2],

"parents": tuple(parents),

}

) )

# There should be at most one row anyway

yield from (row[0] for row in cursor.fetchall())

def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]: def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:

# TODO: this code is duplicated here (same as in swh.provenance.storage.archive) # TODO: this code is duplicated here (same as in swh.provenance.storage.archive)

# but it's just temporary. This method should actually perform a direct query to # but it's just temporary. This method should actually perform a direct query to

# the SQL db of the archive. # the SQL db of the archive.

from swh.core.utils import grouper from swh.core.utils import grouper

from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.storage.algos.snapshot import snapshot_get_all_branches

Show All 19 Lines def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:

revisions: Set[Sha1Git] = set() revisions: Set[Sha1Git] = set()

for targets in grouper(targets_set, batchsize): for targets in grouper(targets_set, batchsize):

revisions.update( revisions.update(

revision.id revision.id

for revision in self.storage.revision_get(targets) for revision in self.storage.revision_get(targets)

if revision is not None if revision is not None

) )

yield from revisions yield from revisions

douarddaUnsubmitted

Done

print statement

douardda: print statement

douarddaUnsubmitted

Done

cursor.execute(

"""

WITH S AS (SELECT object_id FROM snapshot WHERE snapshot.id=%s)

(SELECT B.target AS head

FROM S

JOIN snapshot_branches AS BS ON (S.object_id=BS.snapshot_id)

JOIN snapshot_branch AS B ON (BS.branch_id=B.object_id)

- WHERE B.target_type='revision'::snapshot_target)

- UNION

- (SELECT R.target AS head

- FROM S

- JOIN snapshot_branches AS BS ON (S.object_id=BS.snapshot_id)

- JOIN snapshot_branch AS B ON (BS.branch_id=B.object_id)

- JOIN release AS R ON (B.target=R.id)

- WHERE B.target_type='release'::snapshot_target

- AND R.target_type='revision'::object_type)

+ WHERE B.target_type='revision'::snapshot_target) OR

+ (B.target_type='release'::snapshot_target

+ AND R.target_type='revision'::object_type))

""",

An OR should do the trick here

douardda: An `OR` should do the trick here

aevisoAuthorUnsubmitted

Done

OK, I'll try it

aeviso: OK, I'll try it

aevisoAuthorUnsubmitted

Done

Sorry, I don't see how these queries are equivalent. There is a missing JOIN in the proposed one

aeviso: Sorry, I don't see how these queries are equivalent. There is a missing `JOIN` in the proposed…

douarddaUnsubmitted

Done

you're right, I missed that. You can probably just pack the missing join in there, I don't think it would hurt the query performances (to be checked), so something like:

WITH S AS 
(SELECT object_id FROM snapshot WHERE snapshot.id=%s )

SELECT B.target AS head
  FROM S
  JOIN snapshot_branches AS BS ON (S.object_id=BS.snapshot_id)
  JOIN snapshot_branch AS B ON (BS.branch_id=B.object_id)
  LEFT JOIN release AS R ON (B.target=R.id)
WHERE B.target_type='release'::snapshot_target AND R.target_type='revision'::object_type;

douardda: you're right, I missed that. You can probably just pack the missing join in there, I don't…

aevisoAuthorUnsubmitted

Done

I rather keep the current query for now

aeviso: I rather keep the current query for now