Changeset View
Standalone View
swh/storage/storage.py
Show First 20 Lines • Show All 206 Lines • ▼ Show 20 Lines | def _content_add_metadata(self, db, cur, | ||||
.get(e.diag.constraint_name) | .get(e.diag.constraint_name) | ||||
raise HashCollision(colliding_hash_name) from None | raise HashCollision(colliding_hash_name) from None | ||||
else: | else: | ||||
raise | raise | ||||
if content_without_data: | if content_without_data: | ||||
content_without_data = \ | content_without_data = \ | ||||
[cont.copy() for cont in content_without_data] | [cont.copy() for cont in content_without_data] | ||||
origins = db.origin_get_by_url( | origin_ids = db.origin_id_get_by_url( | ||||
[cont.get('origin') for cont in content_without_data], | [cont.get('origin') for cont in content_without_data], | ||||
cur=cur) | cur=cur) | ||||
for (cont, origin) in zip(content_without_data, origins): | for (cont, origin_id) in zip(content_without_data, origin_ids): | ||||
origin = dict(zip(db.origin_cols, origin)) | |||||
if 'origin' in cont: | if 'origin' in cont: | ||||
cont['origin'] = origin['id'] | cont['origin'] = origin_id | ||||
db.mktemp('skipped_content', cur) | db.mktemp('skipped_content', cur) | ||||
db.copy_to(content_without_data, 'tmp_skipped_content', | db.copy_to(content_without_data, 'tmp_skipped_content', | ||||
db.skipped_content_keys, cur) | db.skipped_content_keys, cur) | ||||
# move metadata in place | # move metadata in place | ||||
db.skipped_content_add_from_temp(cur) | db.skipped_content_add_from_temp(cur) | ||||
@db_transaction() | @db_transaction() | ||||
▲ Show 20 Lines • Show All 865 Lines • ▼ Show 20 Lines | def snapshot_get_latest(self, origin, allowed_statuses=None, db=None, | ||||
order of their names. | order of their names. | ||||
.. warning:: At most 1000 branches contained in the snapshot will be | .. warning:: At most 1000 branches contained in the snapshot will be | ||||
returned for performance reasons. In order to browse the whole | returned for performance reasons. In order to browse the whole | ||||
set of branches, the method :meth:`snapshot_get_branches` | set of branches, the method :meth:`snapshot_get_branches` | ||||
should be used instead. | should be used instead. | ||||
Args: | Args: | ||||
origin (Union[str,int]): the origin's URL or identifier | origin (str): the origin's URL | ||||
ardumont: origin (str): the origin's url | |||||
allowed_statuses (list of str): list of visit statuses considered | allowed_statuses (list of str): list of visit statuses considered | ||||
to find the latest snapshot for the visit. For instance, | to find the latest snapshot for the visit. For instance, | ||||
``allowed_statuses=['full']`` will only consider visits that | ``allowed_statuses=['full']`` will only consider visits that | ||||
have successfully run to completion. | have successfully run to completion. | ||||
Returns: | Returns: | ||||
dict: a dict with three keys: | dict: a dict with three keys: | ||||
* **id**: identifier of the snapshot | * **id**: identifier of the snapshot | ||||
* **branches**: a dict of branches contained in the snapshot | * **branches**: a dict of branches contained in the snapshot | ||||
▲ Show 20 Lines • Show All 99 Lines • ▼ Show 20 Lines | def snapshot_get_branches(self, snapshot_id, branches_from=b'', | ||||
return None | return None | ||||
@db_transaction() | @db_transaction() | ||||
def origin_visit_add(self, origin, date, type, | def origin_visit_add(self, origin, date, type, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
"""Add an origin_visit for the origin at ts with status 'ongoing'. | """Add an origin_visit for the origin at ts with status 'ongoing'. | ||||
Args: | Args: | ||||
origin (Union[int,str]): visited origin's identifier or URL | origin (str): visited origin's identifier or URL | ||||
date (Union[str,datetime]): timestamp of such visit | date (Union[str,datetime]): timestamp of such visit | ||||
type (str): the type of loader used for the visit (hg, git, ...) | type (str): the type of loader used for the visit (hg, git, ...) | ||||
Returns: | Returns: | ||||
dict: dictionary with keys origin and visit where: | dict: dictionary with keys origin and visit where: | ||||
- origin: origin identifier | - origin: origin identifier | ||||
- visit: the visit identifier for the new visit occurrence | - visit: the visit identifier for the new visit occurrence | ||||
""" | """ | ||||
if isinstance(origin, str): | origin_url = origin | ||||
origin = self.origin_get({'url': origin}, db=db, cur=cur) | origin = self.origin_get({'url': origin_url}, db=db, cur=cur) | ||||
origin_id = origin['id'] | |||||
else: | |||||
origin = self.origin_get({'id': origin}, db=db, cur=cur) | |||||
origin_id = origin['id'] | |||||
if isinstance(date, str): | if isinstance(date, str): | ||||
# FIXME: Converge on iso8601 at some point | # FIXME: Converge on iso8601 at some point | ||||
date = dateutil.parser.parse(date) | date = dateutil.parser.parse(date) | ||||
visit_id = db.origin_visit_add(origin_id, date, type, cur) | visit_id = db.origin_visit_add(origin_url, date, type, cur) | ||||
if self.journal_writer: | if self.journal_writer: | ||||
# We can write to the journal only after inserting to the | # We can write to the journal only after inserting to the | ||||
# DB, because we want the id of the visit | # DB, because we want the id of the visit | ||||
del origin['id'] | |||||
self.journal_writer.write_addition('origin_visit', { | self.journal_writer.write_addition('origin_visit', { | ||||
'origin': origin, 'date': date, 'type': type, | 'origin': origin, 'date': date, 'type': type, | ||||
'visit': visit_id, | 'visit': visit_id, | ||||
'status': 'ongoing', 'metadata': None, 'snapshot': None}) | 'status': 'ongoing', 'metadata': None, 'snapshot': None}) | ||||
return { | return { | ||||
'origin': origin_id, | 'origin': origin_url, | ||||
'visit': visit_id, | 'visit': visit_id, | ||||
} | } | ||||
@db_transaction() | @db_transaction() | ||||
def origin_visit_update(self, origin, visit_id, status=None, | def origin_visit_update(self, origin, visit_id, status=None, | ||||
metadata=None, snapshot=None, | metadata=None, snapshot=None, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
"""Update an origin_visit's status. | """Update an origin_visit's status. | ||||
Args: | Args: | ||||
origin (Union[int,str]): visited origin's identifier or URL | origin (str): visited origin's URL | ||||
visit_id: Visit's id | visit_id: Visit's id | ||||
status: Visit's new status | status: Visit's new status | ||||
metadata: Data associated to the visit | metadata: Data associated to the visit | ||||
snapshot (sha1_git): identifier of the snapshot to add to | snapshot (sha1_git): identifier of the snapshot to add to | ||||
the visit | the visit | ||||
Returns: | Returns: | ||||
None | None | ||||
""" | """ | ||||
if isinstance(origin, str): | origin_url = origin | ||||
origin_id = self.origin_get({'url': origin}, db=db, cur=cur)['id'] | visit = db.origin_visit_get(origin_url, visit_id, cur=cur) | ||||
else: | |||||
origin_id = origin | |||||
visit = db.origin_visit_get(origin_id, visit_id, cur=cur) | |||||
if not visit: | if not visit: | ||||
raise ValueError('Invalid visit_id for this origin.') | raise ValueError('Invalid visit_id for this origin.') | ||||
visit = dict(zip(db.origin_visit_get_cols, visit)) | visit = dict(zip(db.origin_visit_get_cols, visit)) | ||||
updates = {} | updates = {} | ||||
if status and status != visit['status']: | if status and status != visit['status']: | ||||
updates['status'] = status | updates['status'] = status | ||||
if metadata and metadata != visit['metadata']: | if metadata and metadata != visit['metadata']: | ||||
updates['metadata'] = metadata | updates['metadata'] = metadata | ||||
if snapshot and snapshot != visit['snapshot']: | if snapshot and snapshot != visit['snapshot']: | ||||
updates['snapshot'] = snapshot | updates['snapshot'] = snapshot | ||||
if updates: | if updates: | ||||
if self.journal_writer: | if self.journal_writer: | ||||
origin = self.origin_get( | origin = self.origin_get( | ||||
[{'id': origin_id}], db=db, cur=cur)[0] | [{'url': origin_url}], db=db, cur=cur)[0] | ||||
Not Done Inline Actionsdo you still need to do that now? I mean, do we still need to pass the full origin to the journal (including our origin id)? ardumont: do you still need to do that now?
I mean, do we still need to pass the full origin to the… | |||||
Done Inline ActionsThat doesn't include the id, but it includes the type, which is currently needed by origin_add vlorentz: That doesn't include the id, but it includes the type, which is currently needed by origin_add | |||||
Not Done Inline Actionsright! ardumont: right! | |||||
del origin['id'] | |||||
self.journal_writer.write_update('origin_visit', { | self.journal_writer.write_update('origin_visit', { | ||||
**visit, **updates, 'origin': origin}) | **visit, **updates, 'origin': origin}) | ||||
db.origin_visit_update(origin_id, visit_id, updates, cur) | db.origin_visit_update(origin_url, visit_id, updates, cur) | ||||
@db_transaction() | @db_transaction() | ||||
def origin_visit_upsert(self, visits, db=None, cur=None): | def origin_visit_upsert(self, visits, db=None, cur=None): | ||||
"""Add a origin_visits with a specific id and with all its data. | """Add a origin_visits with a specific id and with all its data. | ||||
If there is already an origin_visit with the same | If there is already an origin_visit with the same | ||||
`(origin_id, visit_id)`, overwrites it. | `(origin_id, visit_id)`, overwrites it. | ||||
Args: | Args: | ||||
Show All 14 Lines | def origin_visit_upsert(self, visits, db=None, cur=None): | ||||
visit['origin'] = \ | visit['origin'] = \ | ||||
self.origin_get([visit['origin']], db=db, cur=cur)[0] | self.origin_get([visit['origin']], db=db, cur=cur)[0] | ||||
if self.journal_writer: | if self.journal_writer: | ||||
for visit in visits: | for visit in visits: | ||||
visit = copy.deepcopy(visit) | visit = copy.deepcopy(visit) | ||||
if visit.get('type') is None: | if visit.get('type') is None: | ||||
visit['type'] = visit['origin']['type'] | visit['type'] = visit['origin']['type'] | ||||
del visit['origin']['id'] | |||||
self.journal_writer.write_addition('origin_visit', visit) | self.journal_writer.write_addition('origin_visit', visit) | ||||
for visit in visits: | for visit in visits: | ||||
visit['origin'] = visit['origin']['id'] | visit['origin'] = visit['origin']['url'] | ||||
# TODO: upsert them all in a single query | # TODO: upsert them all in a single query | ||||
db.origin_visit_upsert(**visit, cur=cur) | db.origin_visit_upsert(**visit, cur=cur) | ||||
@db_transaction_generator(statement_timeout=500) | @db_transaction_generator(statement_timeout=500) | ||||
def origin_visit_get(self, origin, last_visit=None, limit=None, db=None, | def origin_visit_get(self, origin, last_visit=None, limit=None, db=None, | ||||
cur=None): | cur=None): | ||||
"""Retrieve all the origin's visit's information. | """Retrieve all the origin's visit's information. | ||||
Args: | Args: | ||||
origin (Union[int,str]): The occurrence's origin (identifier/URL). | origin (str): The visited origin | ||||
last_visit: Starting point from which listing the next visits | last_visit: Starting point from which listing the next visits | ||||
Default to None | Default to None | ||||
limit (int): Number of results to return from the last visit. | limit (int): Number of results to return from the last visit. | ||||
Default to None | Default to None | ||||
Yields: | Yields: | ||||
List of visits. | List of visits. | ||||
""" | """ | ||||
if isinstance(origin, str): | |||||
origin = self.origin_get([{'url': origin}], db=db, cur=cur)[0] | |||||
if not origin: | |||||
return | |||||
origin = origin['id'] | |||||
for line in db.origin_visit_get_all( | for line in db.origin_visit_get_all( | ||||
origin, last_visit=last_visit, limit=limit, cur=cur): | origin, last_visit=last_visit, limit=limit, cur=cur): | ||||
data = dict(zip(db.origin_visit_get_cols, line)) | data = dict(zip(db.origin_visit_get_cols, line)) | ||||
yield data | yield data | ||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def origin_visit_find_by_date(self, origin, visit_date, db=None, cur=None): | def origin_visit_find_by_date(self, origin, visit_date, db=None, cur=None): | ||||
"""Retrieves the origin visit whose date is closest to the provided | """Retrieves the origin visit whose date is closest to the provided | ||||
timestamp. | timestamp. | ||||
In case of a tie, the visit with largest id is selected. | In case of a tie, the visit with largest id is selected. | ||||
Args: | Args: | ||||
origin (str): The occurrence's origin (URL). | origin (str): The occurrence's origin (URL). | ||||
target (datetime): target timestamp | target (datetime): target timestamp | ||||
Returns: | Returns: | ||||
A visit. | A visit. | ||||
""" | """ | ||||
origin = self.origin_get([{'url': origin}], db=db, cur=cur)[0] | |||||
if not origin: | |||||
return | |||||
origin = origin['id'] | |||||
line = db.origin_visit_find_by_date(origin, visit_date, cur=cur) | line = db.origin_visit_find_by_date(origin, visit_date, cur=cur) | ||||
if line: | if line: | ||||
return dict(zip(db.origin_visit_get_cols, line)) | return dict(zip(db.origin_visit_get_cols, line)) | ||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def origin_visit_get_by(self, origin, visit, db=None, cur=None): | def origin_visit_get_by(self, origin, visit, db=None, cur=None): | ||||
"""Retrieve origin visit's information. | """Retrieve origin visit's information. | ||||
Args: | Args: | ||||
origin: The occurrence's origin (identifier). | origin: The occurrence's origin (identifier). | ||||
Returns: | Returns: | ||||
The information on that particular (origin, visit) or None if | The information on that particular (origin, visit) or None if | ||||
it does not exist | it does not exist | ||||
""" | """ | ||||
if isinstance(origin, str): | |||||
origin = self.origin_get({'url': origin}, db=db, cur=cur) | |||||
if not origin: | |||||
return | |||||
origin = origin['id'] | |||||
ori_visit = db.origin_visit_get(origin, visit, cur) | ori_visit = db.origin_visit_get(origin, visit, cur) | ||||
if not ori_visit: | if not ori_visit: | ||||
return None | return None | ||||
return dict(zip(db.origin_visit_get_cols, ori_visit)) | return dict(zip(db.origin_visit_get_cols, ori_visit)) | ||||
@db_transaction(statement_timeout=4000) | @db_transaction(statement_timeout=4000) | ||||
def origin_visit_get_latest( | def origin_visit_get_latest( | ||||
Show All 18 Lines | def origin_visit_get_latest( | ||||
visit: origin visit id | visit: origin visit id | ||||
type: type of loader used for the visit | type: type of loader used for the visit | ||||
date: timestamp of such visit | date: timestamp of such visit | ||||
status: Visit's new status | status: Visit's new status | ||||
metadata: Data associated to the visit | metadata: Data associated to the visit | ||||
snapshot (Optional[sha1_git]): identifier of the snapshot | snapshot (Optional[sha1_git]): identifier of the snapshot | ||||
associated to the visit | associated to the visit | ||||
""" | """ | ||||
origin = self.origin_get({'url': origin}, db=db, cur=cur) | |||||
if not origin: | |||||
return | |||||
origin = origin['id'] | |||||
origin_visit = db.origin_visit_get_latest( | origin_visit = db.origin_visit_get_latest( | ||||
origin, allowed_statuses=allowed_statuses, | origin, allowed_statuses=allowed_statuses, | ||||
require_snapshot=require_snapshot, cur=cur) | require_snapshot=require_snapshot, cur=cur) | ||||
if origin_visit: | if origin_visit: | ||||
return dict(zip(db.origin_visit_get_cols, origin_visit)) | return dict(zip(db.origin_visit_get_cols, origin_visit)) | ||||
@db_transaction(statement_timeout=2000) | @db_transaction(statement_timeout=2000) | ||||
def object_find_by_sha1_git(self, ids, db=None, cur=None): | def object_find_by_sha1_git(self, ids, db=None, cur=None): | ||||
Show All 16 Lines | def object_find_by_sha1_git(self, ids, db=None, cur=None): | ||||
for retval in db.object_find_by_sha1_git(ids, cur=cur): | for retval in db.object_find_by_sha1_git(ids, cur=cur): | ||||
if retval[1]: | if retval[1]: | ||||
ret[retval[0]].append(dict(zip(db.object_find_by_sha1_git_cols, | ret[retval[0]].append(dict(zip(db.object_find_by_sha1_git_cols, | ||||
retval))) | retval))) | ||||
return ret | return ret | ||||
origin_keys = ['id', 'url'] | |||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def origin_get(self, origins, db=None, cur=None): | def origin_get(self, origins, db=None, cur=None): | ||||
"""Return origins, either all identified by their ids or all | """Return origins, either all identified by their ids or all | ||||
identified by tuples (type, url). | identified by tuples (type, url). | ||||
If the url is given and the type is omitted, one of the origins with | If the url is given and the type is omitted, one of the origins with | ||||
that url is returned. | that url is returned. | ||||
Args: | Args: | ||||
origin: a list of dictionaries representing the individual | origin: a list of dictionaries representing the individual | ||||
origins to find. | origins to find. | ||||
These dicts have either the key url: | These dicts have the key url: | ||||
- url (bytes): the url the origin points to | - url (bytes): the url the origin points to | ||||
or the id: | |||||
- id: the origin id | |||||
Returns: | Returns: | ||||
dict: the origin dictionary with the keys: | dict: the origin dictionary with the keys: | ||||
- id: origin's id | - id: origin's id | ||||
- url: origin's url | - url: origin's url | ||||
Raises: | Raises: | ||||
ValueError: if the url or the id don't exist. | ValueError: if the url or the id don't exist. | ||||
""" | """ | ||||
if isinstance(origins, dict): | if isinstance(origins, dict): | ||||
# Old API | # Old API | ||||
return_single = True | return_single = True | ||||
origins = [origins] | origins = [origins] | ||||
elif len(origins) == 0: | elif len(origins) == 0: | ||||
return [] | return [] | ||||
else: | else: | ||||
return_single = False | return_single = False | ||||
origin_ids = [origin.get('id') for origin in origins] | origin_urls = [origin['url'] for origin in origins] | ||||
origin_urls = [origin.get('url') for origin in origins] | |||||
if any(origin_ids): | |||||
# Lookup per ID | |||||
if all(origin_ids): | |||||
results = db.origin_get_by_id(origin_ids, cur) | |||||
else: | |||||
raise ValueError( | |||||
'Either all origins or none at all should have an "id".') | |||||
elif any(origin_urls): | |||||
# Lookup per type + URL | |||||
if all(origin_urls): | |||||
results = db.origin_get_by_url(origin_urls, cur) | results = db.origin_get_by_url(origin_urls, cur) | ||||
else: | |||||
raise ValueError( | |||||
'Either all origins or none at all should have ' | |||||
'an "url" key.') | |||||
else: # unsupported lookup | |||||
raise ValueError('Origin must have either id or url.') | |||||
results = [dict(zip(self.origin_keys, result)) | results = [dict(zip(db.origin_cols, result)) | ||||
for result in results] | for result in results] | ||||
if return_single: | if return_single: | ||||
assert len(results) == 1 | assert len(results) == 1 | ||||
if results[0]['id'] is not None: | if results[0]['url'] is not None: | ||||
return results[0] | return results[0] | ||||
else: | else: | ||||
return None | return None | ||||
else: | else: | ||||
return [None if res['id'] is None else res for res in results] | return [None if res['url'] is None else res for res in results] | ||||
Not Done Inline Actionswow, it's been a while since i looked to the origin-get implementation... It's kind of a mess to read... That has nothing to do with the diff though... In any case, last return statement, is res['url'] is None possible at all here now? I guess yes because we don't know what the url inputted can be? ardumont: wow, it's been a while since i looked to the `origin-get` implementation...
too many… | |||||
Done Inline Actions
Let's not make this diff even bigger.
Yes, if the origin is not already in storage, because missing rows are returned by pg as (null, null) because it's a left join instead of inner join. vlorentz: > That has nothing to do with the diff though...
> but since we are touching it...
Let's not… | |||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def origin_get_range(self, origin_from=1, origin_count=100, | def origin_get_range(self, origin_from=1, origin_count=100, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
"""Retrieve ``origin_count`` origins whose ids are greater | """Retrieve ``origin_count`` origins whose ids are greater | ||||
or equal than ``origin_from``. | or equal than ``origin_from``. | ||||
Origins are sorted by id before retrieving them. | Origins are sorted by id before retrieving them. | ||||
Args: | Args: | ||||
origin_from (int): the minimum id of origins to retrieve | origin_from (int): the minimum id of origins to retrieve | ||||
origin_count (int): the maximum number of origins to retrieve | origin_count (int): the maximum number of origins to retrieve | ||||
Yields: | Yields: | ||||
dicts containing origin information as returned | dicts containing origin information as returned | ||||
by :meth:`swh.storage.storage.Storage.origin_get`. | by :meth:`swh.storage.storage.Storage.origin_get`. | ||||
""" | """ | ||||
for origin in db.origin_get_range(origin_from, origin_count, cur): | for origin in db.origin_get_range(origin_from, origin_count, cur): | ||||
yield dict(zip(self.origin_keys, origin)) | yield dict(zip(db.origin_get_range_cols, origin)) | ||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def origin_search(self, url_pattern, offset=0, limit=50, | def origin_search(self, url_pattern, offset=0, limit=50, | ||||
regexp=False, with_visit=False, db=None, cur=None): | regexp=False, with_visit=False, db=None, cur=None): | ||||
"""Search for origins whose urls contain a provided string pattern | """Search for origins whose urls contain a provided string pattern | ||||
or match a provided regular expression. | or match a provided regular expression. | ||||
The search is performed in a case insensitive way. | The search is performed in a case insensitive way. | ||||
Args: | Args: | ||||
url_pattern (str): the string pattern to search for in origin urls | url_pattern (str): the string pattern to search for in origin urls | ||||
offset (int): number of found origins to skip before returning | offset (int): number of found origins to skip before returning | ||||
results | results | ||||
limit (int): the maximum number of found origins to return | limit (int): the maximum number of found origins to return | ||||
regexp (bool): if True, consider the provided pattern as a regular | regexp (bool): if True, consider the provided pattern as a regular | ||||
expression and return origins whose urls match it | expression and return origins whose urls match it | ||||
with_visit (bool): if True, filter out origins with no visit | with_visit (bool): if True, filter out origins with no visit | ||||
Yields: | Yields: | ||||
dicts containing origin information as returned | dicts containing origin information as returned | ||||
by :meth:`swh.storage.storage.Storage.origin_get`. | by :meth:`swh.storage.storage.Storage.origin_get`. | ||||
""" | """ | ||||
for origin in db.origin_search(url_pattern, offset, limit, | for origin in db.origin_search(url_pattern, offset, limit, | ||||
regexp, with_visit, cur): | regexp, with_visit, cur): | ||||
yield dict(zip(self.origin_keys, origin)) | yield dict(zip(db.origin_cols, origin)) | ||||
@db_transaction() | @db_transaction() | ||||
def origin_count(self, url_pattern, regexp=False, | def origin_count(self, url_pattern, regexp=False, | ||||
with_visit=False, db=None, cur=None): | with_visit=False, db=None, cur=None): | ||||
"""Count origins whose urls contain a provided string pattern | """Count origins whose urls contain a provided string pattern | ||||
or match a provided regular expression. | or match a provided regular expression. | ||||
The pattern search in origin urls is performed in a case insensitive | The pattern search in origin urls is performed in a case insensitive | ||||
way. | way. | ||||
Show All 21 Lines | def origin_add(self, origins, db=None, cur=None): | ||||
- url (bytes): the url the origin points to | - url (bytes): the url the origin points to | ||||
Returns: | Returns: | ||||
list: given origins as dict updated with their id | list: given origins as dict updated with their id | ||||
""" | """ | ||||
origins = copy.deepcopy(origins) | origins = copy.deepcopy(origins) | ||||
for origin in origins: | for origin in origins: | ||||
origin['id'] = self.origin_add_one(origin, db=db, cur=cur) | self.origin_add_one(origin, db=db, cur=cur) | ||||
return origins | return origins | ||||
@db_transaction() | @db_transaction() | ||||
def origin_add_one(self, origin, db=None, cur=None): | def origin_add_one(self, origin, db=None, cur=None): | ||||
"""Add origin to the storage | """Add origin to the storage | ||||
Args: | Args: | ||||
origin: dictionary representing the individual origin to add. This | origin: dictionary representing the individual origin to add. This | ||||
dict has the following keys: | dict has the following keys: | ||||
- type (FIXME: enum TBD): the origin type ('git', 'wget', ...) | - type (FIXME: enum TBD): the origin type ('git', 'wget', ...) | ||||
- url (bytes): the url the origin points to | - url (bytes): the url the origin points to | ||||
Returns: | Returns: | ||||
the id of the added origin, or of the identical one that already | the id of the added origin, or of the identical one that already | ||||
exists. | exists. | ||||
""" | """ | ||||
origin_id = list(db.origin_get_by_url( | origin_row = list(db.origin_get_by_url([origin['url']], cur))[0] | ||||
[origin['url']], cur))[0][0] | origin_url = dict(zip(db.origin_cols, origin_row))['url'] | ||||
if origin_id: | if origin_url: | ||||
return origin_id | return origin_url | ||||
if self.journal_writer: | if self.journal_writer: | ||||
self.journal_writer.write_addition('origin', origin) | self.journal_writer.write_addition('origin', origin) | ||||
return db.origin_add(origin['url'], cur) | return db.origin_add(origin['url'], cur) | ||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def stat_counters(self, db=None, cur=None): | def stat_counters(self, db=None, cur=None): | ||||
Show All 23 Lines | def refresh_stat_counters(self, db=None, cur=None): | ||||
'revision_history', | 'revision_history', | ||||
'skipped_content', | 'skipped_content', | ||||
'snapshot'] | 'snapshot'] | ||||
for key in keys: | for key in keys: | ||||
cur.execute('select * from swh_update_counter(%s)', (key,)) | cur.execute('select * from swh_update_counter(%s)', (key,)) | ||||
@db_transaction() | @db_transaction() | ||||
def origin_metadata_add(self, origin_id, ts, provider, tool, metadata, | def origin_metadata_add(self, origin_url, ts, provider, tool, metadata, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
""" Add an origin_metadata for the origin at ts with provenance and | """ Add an origin_metadata for the origin at ts with provenance and | ||||
metadata. | metadata. | ||||
Args: | Args: | ||||
origin_id (int): the origin's id for which the metadata is added | origin_url (str): the origin url for which the metadata is added | ||||
ts (datetime): timestamp of the found metadata | ts (datetime): timestamp of the found metadata | ||||
provider (int): the provider of metadata (ex:'hal') | provider (int): the provider of metadata (ex:'hal') | ||||
tool (int): tool used to extract metadata | tool (int): tool used to extract metadata | ||||
metadata (jsonb): the metadata retrieved at the time and location | metadata (jsonb): the metadata retrieved at the time and location | ||||
Returns: | |||||
id (int): the origin_metadata unique id | |||||
""" | """ | ||||
if isinstance(origin_id, str): | |||||
origin = self.origin_get({'url': origin_id}, db=db, cur=cur) | |||||
if not origin: | |||||
return | |||||
origin_id = origin['id'] | |||||
if isinstance(ts, str): | if isinstance(ts, str): | ||||
ts = dateutil.parser.parse(ts) | ts = dateutil.parser.parse(ts) | ||||
return db.origin_metadata_add(origin_id, ts, provider, tool, | db.origin_metadata_add(origin_url, ts, provider, tool, | ||||
metadata, cur) | metadata, cur) | ||||
@db_transaction_generator(statement_timeout=500) | @db_transaction_generator(statement_timeout=500) | ||||
def origin_metadata_get_by(self, origin_id, provider_type=None, db=None, | def origin_metadata_get_by(self, origin_url, provider_type=None, db=None, | ||||
cur=None): | cur=None): | ||||
"""Retrieve list of all origin_metadata entries for the origin_id | """Retrieve list of all origin_metadata entries for the origin_id | ||||
Args: | Args: | ||||
origin_id (int): the unique origin identifier | origin_url (str): the origin's URL | ||||
provider_type (str): (optional) type of provider | provider_type (str): (optional) type of provider | ||||
Returns: | Returns: | ||||
list of dicts: the origin_metadata dictionary with the keys: | list of dicts: the origin_metadata dictionary with the keys: | ||||
- origin_id (int): origin's id | - origin_id (int): origin's id | ||||
- discovery_date (datetime): timestamp of discovery | - discovery_date (datetime): timestamp of discovery | ||||
- tool_id (int): metadata's extracting tool | - tool_id (int): metadata's extracting tool | ||||
- metadata (jsonb) | - metadata (jsonb) | ||||
- provider_id (int): metadata's provider | - provider_id (int): metadata's provider | ||||
- provider_name (str) | - provider_name (str) | ||||
- provider_type (str) | - provider_type (str) | ||||
- provider_url (str) | - provider_url (str) | ||||
""" | """ | ||||
if isinstance(origin_id, str): | for line in db.origin_metadata_get_by(origin_url, provider_type, cur): | ||||
origin = self.origin_get({'url': origin_id}, db=db, cur=cur) | |||||
if not origin: | |||||
return | |||||
origin_id = origin['id'] | |||||
for line in db.origin_metadata_get_by(origin_id, provider_type, cur): | |||||
yield dict(zip(db.origin_metadata_get_cols, line)) | yield dict(zip(db.origin_metadata_get_cols, line)) | ||||
@db_transaction() | @db_transaction() | ||||
def tool_add(self, tools, db=None, cur=None): | def tool_add(self, tools, db=None, cur=None): | ||||
"""Add new tools to the storage. | """Add new tools to the storage. | ||||
Args: | Args: | ||||
tools (iterable of :class:`dict`): Tool information to add to | tools (iterable of :class:`dict`): Tool information to add to | ||||
▲ Show 20 Lines • Show All 145 Lines • Show Last 20 Lines |
origin (str): the origin's url