Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163854
D3327.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
13 KB
Subscribers
None
D3327.id.diff
View Options
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -763,12 +763,17 @@
return [{"url": orig.url,} for orig in origins[offset : offset + limit]]
- def origin_add(self, origins: Iterable[Origin]) -> List[Dict]:
- results = []
- for origin in origins:
- self.origin_add_one(origin)
- results.append(origin.to_dict())
- return results
+ def origin_add(self, origins: Iterable[Origin]) -> Dict[str, int]:
+ known_origins = [
+ Origin.from_dict(d)
+ for d in self.origin_get([origin.to_dict() for origin in origins])
+ if d is not None
+ ]
+ to_add = [origin for origin in origins if origin not in known_origins]
+ self.journal_writer.origin_add(to_add)
+ for origin in to_add:
+ self._cql_runner.origin_add_one(origin)
+ return {"origin:add": len(to_add)}
def origin_add_one(self, origin: Origin) -> str:
known_origin = self.origin_get_one(origin.to_dict())
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -753,11 +753,15 @@
)
)
- def origin_add(self, origins: Iterable[Origin]) -> List[Dict]:
- origins = copy.deepcopy(list(origins))
+ def origin_add(self, origins: Iterable[Origin]) -> Dict[str, int]:
+ origins = list(origins)
+ added = 0
for origin in origins:
- self.origin_add_one(origin)
- return [origin.to_dict() for origin in origins]
+ if origin.url not in self._origins:
+ self.origin_add_one(origin)
+ added += 1
+
+ return {"origin:add": added}
def origin_add_one(self, origin: Origin) -> str:
if origin.url not in self._origins:
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -1047,7 +1047,7 @@
...
@remote_api_endpoint("origin/add_multi")
- def origin_add(self, origins: Iterable[Origin]) -> List[Dict]:
+ def origin_add(self, origins: Iterable[Origin]) -> Dict[str, int]:
"""Add origins to the storage
Args:
@@ -1058,7 +1058,9 @@
- url (bytes): the url the origin points to
Returns:
- list: given origins as dict updated with their id
+ Summary dict of keys with associated count as values
+
+ origin:add: Count of object actually stored in db
"""
...
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1093,26 +1093,28 @@
@timed
@db_transaction()
- def origin_add(self, origins: Iterable[Origin], db=None, cur=None) -> List[Dict]:
- origins = list(origins)
- for origin in origins:
- self.origin_add_one(origin, db=db, cur=cur)
-
- return [o.to_dict() for o in origins]
+ def origin_add(
+ self, origins: Iterable[Origin], db=None, cur=None
+ ) -> Dict[str, int]:
+ urls = [o.url for o in origins]
+ known_origins = set(url for (url,) in db.origin_get_by_url(urls, cur))
+ # use lists here to keep origins sorted; some tests depend on this
+ to_add = [url for url in urls if url not in known_origins]
+
+ self.journal_writer.origin_add([Origin(url=url) for url in to_add])
+ added = 0
+ for url in to_add:
+ if db.origin_add(url, cur):
+ added += 1
+ return {"origin:add": added}
@timed
@db_transaction()
def origin_add_one(self, origin: Origin, db=None, cur=None) -> str:
- origin_row = list(db.origin_get_by_url([origin.url], cur))[0]
- origin_url = dict(zip(db.origin_cols, origin_row))["url"]
- if origin_url:
- return origin_url
-
- self.journal_writer.origin_add([origin])
-
- url = db.origin_add(origin.url, cur)
- send_metric("origin:add", count=1, method_name="origin_add_one")
- return url
+ stats = self.origin_add([origin])
+ if stats.get("origin:add", 0):
+ send_metric("origin:add", count=1, method_name="origin_add_one")
+ return origin.url
@db_transaction(statement_timeout=500)
def stat_counters(self, db=None, cur=None):
diff --git a/swh/storage/tests/algos/test_origin.py b/swh/storage/tests/algos/test_origin.py
--- a/swh/storage/tests/algos/test_origin.py
+++ b/swh/storage/tests/algos/test_origin.py
@@ -26,7 +26,12 @@
def test_iter_origins(swh_storage):
- origins = swh_storage.origin_add([{"url": "bar"}, {"url": "qux"}, {"url": "quuz"},])
+ origins = [
+ {"url": "bar"},
+ {"url": "qux"},
+ {"url": "quuz"},
+ ]
+ assert swh_storage.origin_add(origins) == {"origin:add": 3}
assert_list_eq(iter_origins(swh_storage), origins)
assert_list_eq(iter_origins(swh_storage, batch_size=1), origins)
assert_list_eq(iter_origins(swh_storage, batch_size=2), origins)
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -1314,22 +1314,21 @@
origin0 = swh_storage.origin_get([data.origin])[0]
assert origin0 is None
- origin1, origin2 = swh_storage.origin_add([data.origin, data.origin2])
+ stats = swh_storage.origin_add([data.origin, data.origin2])
+ assert stats == {"origin:add": 2}
actual_origin = swh_storage.origin_get([{"url": data.origin["url"],}])[0]
- assert actual_origin["url"] == origin1["url"]
+ assert actual_origin["url"] == data.origin["url"]
actual_origin2 = swh_storage.origin_get([{"url": data.origin2["url"],}])[0]
- assert actual_origin2["url"] == origin2["url"]
-
- if "id" in actual_origin:
- del actual_origin["id"]
- del actual_origin2["id"]
+ assert actual_origin2["url"] == data.origin2["url"]
- assert list(swh_storage.journal_writer.journal.objects) == [
- ("origin", Origin.from_dict(actual_origin)),
- ("origin", Origin.from_dict(actual_origin2)),
- ]
+ assert set(swh_storage.journal_writer.journal.objects) == set(
+ [
+ ("origin", Origin.from_dict(actual_origin)),
+ ("origin", Origin.from_dict(actual_origin2)),
+ ]
+ )
swh_storage.refresh_stat_counters()
assert swh_storage.stat_counters()["origin"] == 2
@@ -1339,40 +1338,47 @@
yield data.origin
yield data.origin2
- origin1, origin2 = swh_storage.origin_add(_ori_gen())
+ stats = swh_storage.origin_add(_ori_gen())
+ assert stats == {"origin:add": 2}
actual_origin = swh_storage.origin_get([{"url": data.origin["url"],}])[0]
- assert actual_origin["url"] == origin1["url"]
+ assert actual_origin["url"] == data.origin["url"]
actual_origin2 = swh_storage.origin_get([{"url": data.origin2["url"],}])[0]
- assert actual_origin2["url"] == origin2["url"]
+ assert actual_origin2["url"] == data.origin2["url"]
if "id" in actual_origin:
del actual_origin["id"]
del actual_origin2["id"]
- assert list(swh_storage.journal_writer.journal.objects) == [
- ("origin", Origin.from_dict(actual_origin)),
- ("origin", Origin.from_dict(actual_origin2)),
- ]
+ assert set(swh_storage.journal_writer.journal.objects) == set(
+ [
+ ("origin", Origin.from_dict(actual_origin)),
+ ("origin", Origin.from_dict(actual_origin2)),
+ ]
+ )
swh_storage.refresh_stat_counters()
assert swh_storage.stat_counters()["origin"] == 2
def test_origin_add_twice(self, swh_storage):
add1 = swh_storage.origin_add([data.origin, data.origin2])
- assert list(swh_storage.journal_writer.journal.objects) == [
- ("origin", Origin.from_dict(data.origin)),
- ("origin", Origin.from_dict(data.origin2)),
- ]
+ assert set(swh_storage.journal_writer.journal.objects) == set(
+ [
+ ("origin", Origin.from_dict(data.origin)),
+ ("origin", Origin.from_dict(data.origin2)),
+ ]
+ )
+ assert add1 == {"origin:add": 2}
add2 = swh_storage.origin_add([data.origin, data.origin2])
- assert list(swh_storage.journal_writer.journal.objects) == [
- ("origin", Origin.from_dict(data.origin)),
- ("origin", Origin.from_dict(data.origin2)),
- ]
-
- assert add1 == add2
+ assert set(swh_storage.journal_writer.journal.objects) == set(
+ [
+ ("origin", Origin.from_dict(data.origin)),
+ ("origin", Origin.from_dict(data.origin2)),
+ ]
+ )
+ assert add2 == {"origin:add": 0}
def test_origin_add_validation(self, swh_storage):
"""Incorrect formatted origin should fail the validation
@@ -1394,12 +1400,18 @@
def test_origin_get(self, swh_storage):
assert swh_storage.origin_get(data.origin) is None
+ assert swh_storage.origin_get([data.origin]) == [None]
swh_storage.origin_add_one(data.origin)
actual_origin0 = swh_storage.origin_get([{"url": data.origin["url"]}])
assert len(actual_origin0) == 1
assert actual_origin0[0]["url"] == data.origin["url"]
+ actual_origins = swh_storage.origin_get(
+ [{"url": data.origin["url"]}, {"url": "not://exists"}]
+ )
+ assert actual_origins == [{"url": data.origin["url"]}, None]
+
def _generate_random_visits(self, nb_visits=100, start=0, end=7):
"""Generate random visits within the last 2 months (to avoid
computations)
@@ -3234,7 +3246,7 @@
origin = data.origin
fetcher = data.metadata_fetcher
authority = data.metadata_authority
- swh_storage.origin_add([origin])[0]
+ assert swh_storage.origin_add([origin]) == {"origin:add": 1}
swh_storage.metadata_fetcher_add(**fetcher)
swh_storage.metadata_authority_add(**authority)
@@ -3253,7 +3265,7 @@
origin = data.origin
fetcher = data.metadata_fetcher
authority = data.metadata_authority
- swh_storage.origin_add([origin])[0]
+ assert swh_storage.origin_add([origin]) == {"origin:add": 1}
new_origin_metadata2 = {
**data.origin_metadata2,
@@ -3278,7 +3290,7 @@
origin = data.origin
fetcher = data.metadata_fetcher
authority = data.metadata_authority
- swh_storage.origin_add([origin])[0]
+ assert swh_storage.origin_add([origin]) == {"origin:add": 1}
swh_storage.metadata_fetcher_add(**fetcher)
swh_storage.metadata_authority_add(**authority)
@@ -3296,8 +3308,7 @@
fetcher2 = data.metadata_fetcher2
origin_url1 = data.origin["url"]
origin_url2 = data.origin2["url"]
- swh_storage.origin_add([data.origin])
- swh_storage.origin_add([data.origin2])
+ assert swh_storage.origin_add([data.origin, data.origin2]) == {"origin:add": 2}
origin1_metadata1 = data.origin_metadata
origin1_metadata2 = data.origin_metadata2
@@ -3334,7 +3345,7 @@
origin = data.origin
fetcher = data.metadata_fetcher
authority = data.metadata_authority
- swh_storage.origin_add([origin])[0]
+ assert swh_storage.origin_add([origin]) == {"origin:add": 1}
swh_storage.metadata_fetcher_add(**fetcher)
swh_storage.metadata_authority_add(**authority)
@@ -3368,7 +3379,7 @@
origin = data.origin
fetcher = data.metadata_fetcher
authority = data.metadata_authority
- swh_storage.origin_add([origin])[0]
+ assert swh_storage.origin_add([origin]) == {"origin:add": 1}
swh_storage.metadata_fetcher_add(**fetcher)
swh_storage.metadata_authority_add(**authority)
@@ -3393,7 +3404,7 @@
fetcher1 = data.metadata_fetcher
fetcher2 = data.metadata_fetcher2
authority = data.metadata_authority
- swh_storage.origin_add([origin])[0]
+ assert swh_storage.origin_add([origin]) == {"origin:add": 1}
swh_storage.metadata_fetcher_add(**fetcher1)
swh_storage.metadata_fetcher_add(**fetcher2)
diff --git a/swh/storage/validate.py b/swh/storage/validate.py
--- a/swh/storage/validate.py
+++ b/swh/storage/validate.py
@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
import contextlib
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, TypeVar, Union
+from typing import Dict, Iterable, Iterator, Optional, Tuple, Type, TypeVar, Union
from swh.model.model import (
SkippedContent,
@@ -138,7 +138,7 @@
def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]:
return self.storage.origin_visit_add(visits)
- def origin_add(self, origins: Iterable[Union[Dict, Origin]]) -> List:
+ def origin_add(self, origins: Iterable[Union[Dict, Origin]]) -> Dict[str, int]:
return self.storage.origin_add([dict_converter(Origin, o) for o in origins])
def origin_add_one(self, origin: Union[Dict, Origin]) -> int:
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 4:48 PM (2 h, 47 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221600
Attached To
D3327: Refactor the Storage.add_origin API
Event Timeline
Log In to Comment