Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/storage.py
# Copyright (C) 2015-2020 The Software Heritage developers | # Copyright (C) 2015-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import contextlib | import contextlib | ||||
import datetime | import datetime | ||||
import itertools | import itertools | ||||
import json | |||||
from collections import defaultdict | from collections import defaultdict | ||||
from contextlib import contextmanager | from contextlib import contextmanager | ||||
from typing import Any, Dict, Iterable, List, Optional, Union | from typing import Any, Dict, Iterable, List, Optional, Union | ||||
import attr | import attr | ||||
import dateutil.parser | import dateutil.parser | ||||
import psycopg2 | import psycopg2 | ||||
▲ Show 20 Lines • Show All 1,213 Lines • ▼ Show 20 Lines | def refresh_stat_counters(self, db=None, cur=None): | ||||
] | ] | ||||
for key in keys: | for key in keys: | ||||
cur.execute("select * from swh_update_counter(%s)", (key,)) | cur.execute("select * from swh_update_counter(%s)", (key,)) | ||||
@timed | @timed | ||||
@db_transaction() | @db_transaction() | ||||
def origin_metadata_add( | def origin_metadata_add( | ||||
self, origin_url, ts, provider, tool, metadata, db=None, cur=None | self, | ||||
): | origin_url: str, | ||||
if isinstance(ts, str): | discovery_date: datetime.datetime, | ||||
ts = dateutil.parser.parse(ts) | authority: Dict[str, Any], | ||||
fetcher: Dict[str, Any], | |||||
db.origin_metadata_add(origin_url, ts, provider, tool, metadata, cur) | format: str, | ||||
metadata: bytes, | |||||
db=None, | |||||
cur=None, | |||||
) -> None: | |||||
authority_id = db.metadata_authority_get_id( | |||||
authority["type"], authority["url"], cur | |||||
) | |||||
if not authority_id: | |||||
raise StorageArgumentException(f"Unknown authority {authority}") | |||||
fetcher_id = db.metadata_fetcher_get_id( | |||||
fetcher["name"], fetcher["version"], cur | |||||
) | |||||
if not fetcher_id: | |||||
ardumont: nitpick: you could move the authority check to just after the authority lookup.
There is no point in fetching the fetcher if the authority_id is not resolved. | |||||
raise StorageArgumentException(f"Unknown fetcher {fetcher}") | |||||
db.origin_metadata_add( | |||||
origin_url, discovery_date, authority_id, fetcher_id, format, metadata, cur | |||||
) | |||||
send_metric("origin_metadata:add", count=1, method_name="origin_metadata_add") | send_metric("origin_metadata:add", count=1, method_name="origin_metadata_add") | ||||
@timed | @timed | ||||
@db_transaction_generator(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def origin_metadata_get_by(self, origin_url, provider_type=None, db=None, cur=None): | def origin_metadata_get( | ||||
for line in db.origin_metadata_get_by(origin_url, provider_type, cur): | self, | ||||
yield dict(zip(db.origin_metadata_get_cols, line)) | origin_url: str, | ||||
authority: Dict[str, str], | |||||
after: Optional[datetime.datetime] = None, | |||||
limit: Optional[int] = None, | |||||
db=None, | |||||
cur=None, | |||||
) -> List[Dict[str, Any]]: | |||||
authority_id = db.metadata_authority_get_id( | |||||
authority["type"], authority["url"], cur | |||||
) | |||||
if not authority_id: | |||||
return [] | |||||
results = [] | |||||
for line in db.origin_metadata_get(origin_url, authority_id, after, limit, cur): | |||||
row = dict(zip(db.origin_metadata_get_cols, line)) | |||||
results.append( | |||||
{ | |||||
"origin_url": row.pop("origin.url"), | |||||
"authority": { | |||||
"type": row.pop("metadata_authority.type"), | |||||
"url": row.pop("metadata_authority.url"), | |||||
}, | |||||
"fetcher": { | |||||
"name": row.pop("metadata_fetcher.name"), | |||||
"version": row.pop("metadata_fetcher.version"), | |||||
}, | |||||
**row, | |||||
} | |||||
) | |||||
return results | |||||
@timed | @timed | ||||
@db_transaction() | @db_transaction() | ||||
def tool_add(self, tools, db=None, cur=None): | def metadata_fetcher_add( | ||||
db.mktemp_tool(cur) | self, name: str, version: str, metadata: Dict[str, Any], db=None, cur=None | ||||
with convert_validation_exceptions(): | ) -> None: | ||||
db.copy_to(tools, "tmp_tool", ["name", "version", "configuration"], cur) | db.metadata_fetcher_add(name, version, metadata) | ||||
tools = db.tool_add_from_temp(cur) | send_metric("metadata_fetcher:add", count=1, method_name="metadata_fetcher") | ||||
results = [dict(zip(db.tool_cols, line)) for line in tools] | |||||
send_metric("tool:add", count=len(results), method_name="tool_add") | |||||
return results | |||||
@timed | @timed | ||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def tool_get(self, tool, db=None, cur=None): | def metadata_fetcher_get( | ||||
tool_conf = tool["configuration"] | self, name: str, version: str, db=None, cur=None | ||||
if isinstance(tool_conf, dict): | ) -> Optional[Dict[str, Any]]: | ||||
Not Done · Inline Actions · ardumont:
```
row = db.metadata_fetcher_get(name, version, cur)
``` | |||||
tool_conf = json.dumps(tool_conf) | row = db.metadata_fetcher_get(name, version, cur=cur) | ||||
if not row: | |||||
idx = db.tool_get(tool["name"], tool["version"], tool_conf) | |||||
if not idx: | |||||
return None | return None | ||||
return dict(zip(db.tool_cols, idx)) | return dict(zip(db.metadata_fetcher_cols, row)) | ||||
@timed | @timed | ||||
@db_transaction() | @db_transaction() | ||||
def metadata_provider_add( | def metadata_authority_add( | ||||
self, provider_name, provider_type, provider_url, metadata, db=None, cur=None | self, type: str, url: str, metadata: Dict[str, Any], db=None, cur=None | ||||
): | ) -> None: | ||||
result = db.metadata_provider_add( | db.metadata_authority_add(type, url, metadata, cur) | ||||
provider_name, provider_type, provider_url, metadata, cur | send_metric("metadata_authority:add", count=1, method_name="metadata_authority") | ||||
) | |||||
send_metric("metadata_provider:add", count=1, method_name="metadata_provider") | |||||
return result | |||||
@timed | |||||
@db_transaction() | |||||
def metadata_provider_get(self, provider_id, db=None, cur=None): | |||||
result = db.metadata_provider_get(provider_id) | |||||
if not result: | |||||
return None | |||||
return dict(zip(db.metadata_provider_cols, result)) | |||||
@timed | @timed | ||||
@db_transaction() | @db_transaction() | ||||
def metadata_provider_get_by(self, provider, db=None, cur=None): | def metadata_authority_get( | ||||
result = db.metadata_provider_get_by( | self, type: str, url: str, db=None, cur=None | ||||
provider["provider_name"], provider["provider_url"] | ) -> Optional[Dict[str, Any]]: | ||||
Not Done · Inline Actions · ardumont:
```
row = db.metadata_authority_get(type, url, cur)
``` | |||||
) | row = db.metadata_authority_get(type, url, cur=cur) | ||||
if not result: | if not row: | ||||
return None | return None | ||||
return dict(zip(db.metadata_provider_cols, result)) | return dict(zip(db.metadata_authority_cols, row)) | ||||
@timed | @timed | ||||
def diff_directories(self, from_dir, to_dir, track_renaming=False): | def diff_directories(self, from_dir, to_dir, track_renaming=False): | ||||
return diff.diff_directories(self, from_dir, to_dir, track_renaming) | return diff.diff_directories(self, from_dir, to_dir, track_renaming) | ||||
@timed | @timed | ||||
def diff_revisions(self, from_rev, to_rev, track_renaming=False): | def diff_revisions(self, from_rev, to_rev, track_renaming=False): | ||||
return diff.diff_revisions(self, from_rev, to_rev, track_renaming) | return diff.diff_revisions(self, from_rev, to_rev, track_renaming) | ||||
Show All 13 Lines |
nitpick: you could move the `if not authority_id` check to just after the `metadata_authority_get_id` call.
There is no point in fetching the fetcher if the authority_id is not resolved.