Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/storage.py
Show First 20 Lines • Show All 917 Lines • ▼ Show 20 Lines | def snapshot_add(self, snapshots, origin=None, visit=None, | ||||
snapshot:add: Count of object actually stored in db | snapshot:add: Count of object actually stored in db | ||||
""" | """ | ||||
if origin: | if origin: | ||||
if not visit: | if not visit: | ||||
raise TypeError( | raise TypeError( | ||||
'snapshot_add expects one argument (or, as a legacy ' | 'snapshot_add expects one argument (or, as a legacy ' | ||||
'behavior, three arguments), not two') | 'behavior, three arguments), not two') | ||||
if isinstance(snapshots, int): | if isinstance(snapshots, (int, str)): | ||||
# Called by legacy code that uses the new api/client.py | # Called by legacy code that uses the new api/client.py | ||||
(origin_id, visit_id, snapshots) = \ | (origin_id, visit_id, snapshots) = \ | ||||
(snapshots, origin, [visit]) | (snapshots, origin, [visit]) | ||||
else: | else: | ||||
# Called by legacy code that uses the old api/client.py | # Called by legacy code that uses the old api/client.py | ||||
origin_id = origin | origin_id = origin | ||||
visit_id = visit | visit_id = visit | ||||
snapshots = [snapshots] | snapshots = [snapshots] | ||||
▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines | def snapshot_get_latest(self, origin, allowed_statuses=None, db=None, | ||||
* **id**: identifier of the snapshot | * **id**: identifier of the snapshot | ||||
* **branches**: a dict of branches contained in the snapshot | * **branches**: a dict of branches contained in the snapshot | ||||
whose keys are the branches' names. | whose keys are the branches' names. | ||||
* **next_branch**: the name of the first branch not returned | * **next_branch**: the name of the first branch not returned | ||||
or :const:`None` if the snapshot has less than 1000 | or :const:`None` if the snapshot has less than 1000 | ||||
branches. | branches. | ||||
""" | """ | ||||
if isinstance(origin, int): | if isinstance(origin, int): | ||||
origin = self.origin_get({'id': origin}, db=db, cur=cur)['url'] | origin = self.origin_get({'id': origin}, db=db, cur=cur) | ||||
if not origin: | |||||
return | |||||
origin = origin['url'] | |||||
origin_visit = self.origin_visit_get_latest( | origin_visit = self.origin_visit_get_latest( | ||||
origin, allowed_statuses=allowed_statuses, require_snapshot=True, | origin, allowed_statuses=allowed_statuses, require_snapshot=True, | ||||
db=db, cur=cur) | db=db, cur=cur) | ||||
if origin_visit and origin_visit['snapshot']: | if origin_visit and origin_visit['snapshot']: | ||||
snapshot = self.snapshot_get( | snapshot = self.snapshot_get( | ||||
origin_visit['snapshot'], db=db, cur=cur) | origin_visit['snapshot'], db=db, cur=cur) | ||||
if not snapshot: | if not snapshot: | ||||
▲ Show 20 Lines • Show All 203 Lines • ▼ Show 20 Lines | def origin_visit_upsert(self, visits, db=None, cur=None): | ||||
metadata: Data associated to the visit | metadata: Data associated to the visit | ||||
snapshot (sha1_git): identifier of the snapshot to add to | snapshot (sha1_git): identifier of the snapshot to add to | ||||
the visit | the visit | ||||
""" | """ | ||||
visits = copy.deepcopy(visits) | visits = copy.deepcopy(visits) | ||||
for visit in visits: | for visit in visits: | ||||
if isinstance(visit['date'], str): | if isinstance(visit['date'], str): | ||||
visit['date'] = dateutil.parser.parse(visit['date']) | visit['date'] = dateutil.parser.parse(visit['date']) | ||||
if isinstance(visit['origin'], str): | |||||
visit['origin'] = \ | |||||
self.origin_get({'url': visit['origin']})['id'] | |||||
if self.journal_writer: | if self.journal_writer: | ||||
for visit in visits: | for visit in visits: | ||||
visit = visit.copy() | visit = visit.copy() | ||||
origin = self.origin_get( | origin = self.origin_get( | ||||
[{'id': visit['origin']}], db=db, cur=cur)[0] | [{'id': visit['origin']}], db=db, cur=cur)[0] | ||||
visit['origin'] = origin | visit['origin'] = origin | ||||
if visit.get('type') is None: | if visit.get('type') is None: | ||||
visit['type'] = origin['type'] | visit['type'] = origin['type'] | ||||
del visit['origin']['id'] | del visit['origin']['id'] | ||||
self.journal_writer.write_addition('origin_visit', visit) | self.journal_writer.write_addition('origin_visit', visit) | ||||
for visit in visits: | for visit in visits: | ||||
# TODO: upsert them all in a single query | # TODO: upsert them all in a single query | ||||
db.origin_visit_upsert(**visit, cur=cur) | db.origin_visit_upsert(**visit, cur=cur) | ||||
@db_transaction_generator(statement_timeout=500) | @db_transaction_generator(statement_timeout=500) | ||||
def origin_visit_get(self, origin, last_visit=None, limit=None, db=None, | def origin_visit_get(self, origin, last_visit=None, limit=None, db=None, | ||||
cur=None): | cur=None): | ||||
"""Retrieve all the origin's visit's information. | """Retrieve all the origin's visit's information. | ||||
Args: | Args: | ||||
origin (int): The occurrence's origin (identifier). | origin (Union[int,str]): The occurrence's origin (identifier/URL). | ||||
last_visit: Starting point from which listing the next visits | last_visit: Starting point from which listing the next visits | ||||
Default to None | Default to None | ||||
limit (int): Number of results to return from the last visit. | limit (int): Number of results to return from the last visit. | ||||
Default to None | Default to None | ||||
Yields: | Yields: | ||||
List of visits. | List of visits. | ||||
Show All 17 Lines | def origin_visit_find_by_date(self, origin, visit_date, db=None, cur=None): | ||||
Args: | Args: | ||||
origin (str): The occurrence's origin (URL). | origin (str): The occurrence's origin (URL). | ||||
target (datetime): target timestamp | target (datetime): target timestamp | ||||
Returns: | Returns: | ||||
A visit. | A visit. | ||||
""" | """ | ||||
origin = self.origin_get([{'url': origin}], db=db, cur=cur)[0]['id'] | origin = self.origin_get([{'url': origin}], db=db, cur=cur)[0] | ||||
if not origin: | |||||
return | |||||
origin = origin['id'] | |||||
line = db.origin_visit_find_by_date(origin, visit_date, cur=cur) | line = db.origin_visit_find_by_date(origin, visit_date, cur=cur) | ||||
if line: | if line: | ||||
return dict(zip(db.origin_visit_get_cols, line)) | return dict(zip(db.origin_visit_get_cols, line)) | ||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def origin_visit_get_by(self, origin, visit, db=None, cur=None): | def origin_visit_get_by(self, origin, visit, db=None, cur=None): | ||||
"""Retrieve origin visit's information. | """Retrieve origin visit's information. | ||||
Args: | Args: | ||||
origin: The occurrence's origin (identifier). | origin: The occurrence's origin (identifier). | ||||
Returns: | Returns: | ||||
The information on that particular (origin, visit) or None if | The information on that particular (origin, visit) or None if | ||||
it does not exist | it does not exist | ||||
""" | """ | ||||
if isinstance(origin, str): | if isinstance(origin, str): | ||||
origin = self.origin_get({'url': origin}, db=db, cur=cur)['id'] | origin = self.origin_get({'url': origin}, db=db, cur=cur) | ||||
if not origin: | |||||
return | |||||
origin = origin['id'] | |||||
ori_visit = db.origin_visit_get(origin, visit, cur) | ori_visit = db.origin_visit_get(origin, visit, cur) | ||||
if not ori_visit: | if not ori_visit: | ||||
return None | return None | ||||
return dict(zip(db.origin_visit_get_cols, ori_visit)) | return dict(zip(db.origin_visit_get_cols, ori_visit)) | ||||
@db_transaction(statement_timeout=4000) | @db_transaction(statement_timeout=4000) | ||||
def origin_visit_get_latest( | def origin_visit_get_latest( | ||||
Show All 18 Lines | def origin_visit_get_latest( | ||||
visit: origin visit id | visit: origin visit id | ||||
type: type of loader used for the visit | type: type of loader used for the visit | ||||
date: timestamp of such visit | date: timestamp of such visit | ||||
status: Visit's new status | status: Visit's new status | ||||
metadata: Data associated to the visit | metadata: Data associated to the visit | ||||
snapshot (Optional[sha1_git]): identifier of the snapshot | snapshot (Optional[sha1_git]): identifier of the snapshot | ||||
associated to the visit | associated to the visit | ||||
""" | """ | ||||
origin = self.origin_get({'url': origin}, db=db, cur=cur)['id'] | origin = self.origin_get({'url': origin}, db=db, cur=cur) | ||||
if not origin: | |||||
return | |||||
origin = origin['id'] | |||||
origin_visit = db.origin_visit_get_latest( | origin_visit = db.origin_visit_get_latest( | ||||
origin, allowed_statuses=allowed_statuses, | origin, allowed_statuses=allowed_statuses, | ||||
require_snapshot=require_snapshot, cur=cur) | require_snapshot=require_snapshot, cur=cur) | ||||
if origin_visit: | if origin_visit: | ||||
return dict(zip(db.origin_visit_get_cols, origin_visit)) | return dict(zip(db.origin_visit_get_cols, origin_visit)) | ||||
@db_transaction(statement_timeout=2000) | @db_transaction(statement_timeout=2000) | ||||
▲ Show 20 Lines • Show All 220 Lines • ▼ Show 20 Lines | def origin_add_one(self, origin, db=None, cur=None): | ||||
return db.origin_add(origin['type'], origin['url'], cur) | return db.origin_add(origin['type'], origin['url'], cur) | ||||
@db_transaction() | @db_transaction() | ||||
def fetch_history_start(self, origin_id, db=None, cur=None): | def fetch_history_start(self, origin_id, db=None, cur=None): | ||||
"""Add an entry for origin origin_id in fetch_history. Returns the id | """Add an entry for origin origin_id in fetch_history. Returns the id | ||||
of the added fetch_history entry | of the added fetch_history entry | ||||
""" | """ | ||||
if isinstance(origin_id, str): | if isinstance(origin_id, str): | ||||
origin_id = \ | origin = \ | ||||
self.origin_get([{'url': origin_id}], db=db, cur=cur)[0]['id'] | self.origin_get([{'url': origin_id}], db=db, cur=cur) | ||||
if not origin: | |||||
return | |||||
origin_id = origin[0]['id'] | |||||
fetch_history = { | fetch_history = { | ||||
'origin': origin_id, | 'origin': origin_id, | ||||
'date': datetime.datetime.now(tz=datetime.timezone.utc), | 'date': datetime.datetime.now(tz=datetime.timezone.utc), | ||||
} | } | ||||
return db.create_fetch_history(fetch_history, cur) | return db.create_fetch_history(fetch_history, cur) | ||||
@db_transaction() | @db_transaction() | ||||
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines |