Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/storage.py
Show All 10 Lines | |||||
from contextlib import contextmanager | from contextlib import contextmanager | ||||
from typing import ( | from typing import ( | ||||
Any, | Any, | ||||
Counter, | Counter, | ||||
Dict, | Dict, | ||||
Iterable, | Iterable, | ||||
List, | List, | ||||
Optional, | Optional, | ||||
Tuple, | |||||
Union, | Union, | ||||
) | ) | ||||
import attr | import attr | ||||
import psycopg2 | import psycopg2 | ||||
import psycopg2.pool | import psycopg2.pool | ||||
import psycopg2.errors | import psycopg2.errors | ||||
▲ Show 20 Lines • Show All 932 Lines • ▼ Show 20 Lines | ) -> Optional[Dict[str, Any]]: | ||||
visit = dict(zip(db.origin_visit_get_cols, row)) | visit = dict(zip(db.origin_visit_get_cols, row)) | ||||
return self._origin_visit_apply_update(visit, db) | return self._origin_visit_apply_update(visit, db) | ||||
return None | return None | ||||
@timed | @timed | ||||
@db_transaction() | @db_transaction() | ||||
def origin_visit_get_random( | def origin_visit_get_random( | ||||
self, type: str, db=None, cur=None | self, type: str, db=None, cur=None | ||||
) -> Optional[Dict[str, Any]]: | ) -> Optional[Tuple[OriginVisit, OriginVisitStatus]]: | ||||
row = db.origin_visit_get_random(type, cur) | row = db.origin_visit_get_random(type, cur) | ||||
if row: | if row is not None: | ||||
visit = dict(zip(db.origin_visit_get_cols, row)) | visit = OriginVisit(origin=row[0], visit=row[1], date=row[2], type=row[3],) | ||||
return self._origin_visit_apply_update(visit, db) | visit_status = OriginVisitStatus( | ||||
origin=row[0], | |||||
visit=row[1], | |||||
date=row[2], | |||||
status=row[4], | |||||
metadata=row[5], | |||||
snapshot=row[6], | |||||
vlorentz: This violates the abstraction, as `storage.py` now needs to hardcode the map between indices… | |||||
Done Inline ActionsI was interested in "performance". The alternative i did not comprehend fully yet interests me, i'll look into that ;) ardumont: I was interested in "performance".
I'm a bit wary of how many conversions we keep on doing… | |||||
Not Done Inline ActionsNothing to worry about performance-wise: In [1]: from swh.model.model import OriginVisitStatus, OriginVisit In [2]: import datetime In [3]: cols = ["origin", "visit", "date", "type", "status", "metadata", "snapshot"] In [4]: row = ("o", 42, datetime.datetime.now(), "t", "full", None, None) In [5]: %timeit d = dict(zip(cols, row)); OriginVisit(origin=d["origin"], visit=d["visit"], date=d["date"], type=d["type"]); OriginVisitStatus(origin=d["origin"], visit=d["visit"], date=d["date"], status=d["status"], metadata=d["metadata"], snapshot=d["snapshot"]) 100000 loops, best of 5: 16.6 µs per loop In [6]: %timeit OriginVisit(origin=row[0], visit=row[1], date=row[2], type=row[3]); OriginVisitStatus(origin=row[0], visit=row[1], date=row[2], status=row[4], metadata=row[5], snapshot=row[6]) 100000 loops, best of 5: 15.8 µs per loop Building the OriginVisit and OriginVisitStatus objects takes much longer (about 15µs) than the dict and tuple operations (respectively 100ns and 500ns in your version and mine) vlorentz: Nothing to worry about performance-wise:
```
In [1]: from swh.model.model import… | |||||
Not Done Inline Actionsnot to mention the cost of network operations, which trump all this vlorentz: not to mention the cost of network operations, which trump all this | |||||
Done Inline Actionscool, thanks ;) ardumont: cool, thanks ;) | |||||
) | |||||
return visit, visit_status | |||||
return None | return None | ||||
@timed | @timed | ||||
@db_transaction(statement_timeout=2000) | @db_transaction(statement_timeout=2000) | ||||
def object_find_by_sha1_git(self, ids, db=None, cur=None): | def object_find_by_sha1_git(self, ids, db=None, cur=None): | ||||
ret = {id: [] for id in ids} | ret = {id: [] for id in ids} | ||||
for retval in db.object_find_by_sha1_git(ids, cur=cur): | for retval in db.object_find_by_sha1_git(ids, cur=cur): | ||||
▲ Show 20 Lines • Show All 363 Lines • Show Last 20 Lines |
This violates the abstraction, as storage.py now needs to hardcode the map between indices and names.
I recommend you keep the dict(zip()) and use dict access.
Or alternatively, make db.py return namedtuples (like cql.py does with Row)