D1603.diff: Use origin urls instead of origin ids (swh-loader-core)

diff --git a/swh/loader/core/converters.py b/swh/loader/core/converters.py
--- a/swh/loader/core/converters.py
+++ b/swh/loader/core/converters.py
@@ -9,7 +9,7 @@
def content_for_storage(content, log=None, max_content_size=None,
- origin_id=None):
+ origin_url=None):
"""Prepare content to be ready for storage
Note:
@@ -30,7 +30,7 @@
ret.pop('data', None)
ret.update({'status': 'absent',
'reason': 'Content too large',
- 'origin': origin_id})
+ 'origin': origin_url})
return ret
if 'data' not in ret:
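
For reference, a minimal usage sketch of the renamed keyword (field values
are illustrative; the test_converters.py changes further down in this diff
exercise the same behavior, and the exact content dict layout follows
swh.model conventions):

    from swh.loader.core import converters

    content = {
        'data': b'0123456789',
        'length': 10,
        # plus the usual hash keys (sha1, sha1_git, sha256, ...)
    }
    # With max_content_size below the content length, the content is
    # returned with status 'absent' and its origin recorded as a URL:
    ret = converters.content_for_storage(
        content, max_content_size=5,
        origin_url='http://example.org/')
    assert ret['status'] == 'absent'
    assert ret['origin'] == 'http://example.org/'
    assert 'data' not in ret
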
diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
import logging
import os
import psycopg2
@@ -205,15 +206,16 @@
raise NotImplementedError
def get_save_data_path(self):
- """The path to which we save the data"""
+ """The path to which we archive the loader's raw data"""
if not hasattr(self, '__save_data_path'):
- origin_id = self.origin_id
year = str(self.visit_date.year)
+            origin_url_hash = hashlib.sha1(self.origin['url'].encode('utf-8')).hexdigest()
+
path = os.path.join(
self.config['save_data_path'],
- "%04d" % (origin_id % 10000),
- "%08d" % origin_id,
+ 'sha1:' + origin_url_hash[0:2],
+ origin_url_hash,
year,
)
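
The hunk above replaces the numeric-id sharding ("%04d" / "%08d" buckets)
with sharding by the sha1 of the origin URL. A standalone sketch of the
resulting on-disk layout (base directory and URL are made up):

    import hashlib
    import os

    def save_data_path(base, origin_url, year):
        # Bucket by the first two hex digits of sha1(origin_url), then
        # keep the full digest and the visit year as subdirectories.
        h = hashlib.sha1(origin_url.encode('utf-8')).hexdigest()
        return os.path.join(base, 'sha1:' + h[:2], h, str(year))

    save_data_path('/srv/save', 'https://example.org/repo.git', 2019)
    # -> '/srv/save/sha1:<2 hex digits>/<full 40-hex sha1>/2019'
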
@@ -233,7 +235,7 @@
'swh_num': 1,
'swh_id': log_id
})
- origin_id = self.storage.origin_add_one(origin)
+ self.storage.origin_add_one(origin)
self.log.debug('Done creating %s origin for %s' % (origin['type'],
origin['url']),
extra={
@@ -243,24 +245,23 @@
'swh_id': log_id
})
- return origin_id
-
@retry(retry_on_exception=retry_loading, stop_max_attempt_number=3)
- def send_origin_visit(self, origin_id, visit_date):
+ def send_origin_visit(self, visit_date):
log_id = str(uuid.uuid4())
self.log.debug(
'Creating origin_visit for origin %s at time %s' % (
- origin_id, visit_date),
+ self.origin['url'], visit_date),
extra={
'swh_type': 'storage_send_start',
'swh_content_type': 'origin_visit',
'swh_num': 1,
'swh_id': log_id
})
- origin_visit = self.storage.origin_visit_add(origin_id, visit_date)
+ origin_visit = self.storage.origin_visit_add(
+ self.origin['url'], visit_date)
self.log.debug(
'Done Creating origin_visit for origin %s at time %s' % (
- origin_id, visit_date),
+ self.origin['url'], visit_date),
extra={
'swh_type': 'storage_send_end',
'swh_content_type': 'origin_visit',
@@ -334,12 +335,12 @@
return provider_id
@retry(retry_on_exception=retry_loading, stop_max_attempt_number=3)
- def send_origin_metadata(self, origin_id, visit_date, provider_id,
+ def send_origin_metadata(self, visit_date, provider_id,
tool_id, metadata):
log_id = str(uuid.uuid4())
self.log.debug(
'Creating origin_metadata for origin %s at time %s with provider_id %s and tool_id %s' % ( # noqa
- origin_id, visit_date, provider_id, tool_id),
+ self.origin['url'], visit_date, provider_id, tool_id),
extra={
'swh_type': 'storage_send_start',
'swh_content_type': 'origin_metadata',
@@ -347,11 +348,11 @@
'swh_id': log_id
})
- self.storage.origin_metadata_add(origin_id, visit_date, provider_id,
- tool_id, metadata)
+ self.storage.origin_metadata_add(
+ self.origin['url'], visit_date, provider_id, tool_id, metadata)
self.log.debug(
'Done Creating origin_metadata for origin %s at time %s with provider %s and tool %s' % ( # noqa
- origin_id, visit_date, provider_id, tool_id),
+ self.origin['url'], visit_date, provider_id, tool_id),
extra={
'swh_type': 'storage_send_end',
'swh_content_type': 'origin_metadata',
@@ -360,21 +361,22 @@
})
@retry(retry_on_exception=retry_loading, stop_max_attempt_number=3)
- def update_origin_visit(self, origin_id, visit, status):
+ def update_origin_visit(self, status):
log_id = str(uuid.uuid4())
self.log.debug(
'Updating origin_visit for origin %s with status %s' % (
- origin_id, status),
+ self.origin['url'], status),
extra={
'swh_type': 'storage_send_start',
'swh_content_type': 'origin_visit',
'swh_num': 1,
'swh_id': log_id
})
- self.storage.origin_visit_update(origin_id, visit, status)
+ self.storage.origin_visit_update(
+ self.origin['url'], self.visit, status)
self.log.debug(
'Done updating origin_visit for origin %s with status %s' % (
- origin_id, status),
+ self.origin['url'], status),
extra={
'swh_type': 'storage_send_end',
'swh_content_type': 'origin_visit',
@@ -486,7 +488,7 @@
def send_snapshot(self, snapshot):
self.storage.snapshot_add([snapshot])
self.storage.origin_visit_update(
- self.origin_id, self.visit, snapshot=snapshot['id'])
+ self.origin['url'], self.visit, snapshot=snapshot['id'])
@retry(retry_on_exception=retry_loading, stop_max_attempt_number=3)
def filter_missing_contents(self, contents):
@@ -509,7 +511,7 @@
yield converters.content_for_storage(
contents_per_key[key],
max_content_size=max_content_size,
- origin_id=self.origin_id,
+ origin_url=self.origin['url'],
)
def bulk_send_contents(self, contents):
@@ -665,7 +667,7 @@
send_in_packets(releases, self.send_releases, packet_size)
def open_fetch_history(self):
- return self.storage.fetch_history_start(self.origin_id)
+ return self.storage.fetch_history_start(self.origin['url'])
def close_fetch_history_success(self, fetch_history_id):
data = {
@@ -747,7 +749,7 @@
@abstractmethod
def prepare_origin_visit(self, *args, **kwargs):
"""First step executed by the loader to prepare origin and visit
- references. Set/update self.origin, self.origin_id and
+ references. Set/update self.origin, and
optionally self.origin_url, self.visit_date.
"""
@@ -758,17 +760,16 @@
self.visit references.
"""
- origin_id = self.origin.get('id')
- if origin_id:
- self.origin_id = origin_id
- else:
- self.origin_id = self.send_origin(self.origin)
- self.origin['id'] = self.origin_id
+ origin = self.origin.copy()
+ if 'id' in origin:
+ # TODO: remove the condition when we finished migrating away
+ # from origin ids
+ del origin['id']
+ self.send_origin(origin)
if not self.visit_date: # now as default visit_date if not provided
self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
- self.origin_visit = self.send_origin_visit(
- self.origin_id, self.visit_date)
+ self.origin_visit = self.send_origin_visit(self.visit_date)
self.visit = self.origin_visit['visit']
@abstractmethod
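
One subtlety in the hunk above: self.origin is copied before the legacy
'id' key is removed, so the loader's own dict is left untouched. A
condensed equivalent of the new logic (a guarded del is the same as
dict.pop with a default):

    origin = self.origin.copy()   # don't mutate the loader's own dict
    origin.pop('id', None)        # legacy key, dropped until the id
                                  # migration is complete (see the TODO)
    self.send_origin(origin)
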
@@ -899,8 +900,7 @@
self.store_metadata()
self.close_fetch_history_success(fetch_history_id)
- self.update_origin_visit(
- self.origin_id, self.visit, status=self.visit_status())
+ self.update_origin_visit(status=self.visit_status())
self.post_load()
except Exception:
self.log.exception('Loading failure, updating to `partial` status',
@@ -909,8 +909,7 @@
'swh_task_kwargs': kwargs,
})
self.close_fetch_history_failure(fetch_history_id)
- self.update_origin_visit(
- self.origin_id, self.visit, status='partial')
+ self.update_origin_visit(status='partial')
self.post_load(success=False)
return {'status': 'failed'}
finally:
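
Every other change to loader.py above follows a single pattern: the
helpers drop their origin_id parameter, and the storage API is called
with the origin URL instead. A sketch of the resulting calling
convention (URL and status are illustrative, and the storage object is
assumed to implement the swh.storage interface as used in this diff):

    import datetime

    url = 'https://example.org/repo.git'
    visit_date = datetime.datetime.now(tz=datetime.timezone.utc)

    storage.origin_add_one({'type': 'git', 'url': url})
    origin_visit = storage.origin_visit_add(url, visit_date)
    storage.origin_visit_update(url, origin_visit['visit'], 'full')
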
diff --git a/swh/loader/core/tests/__init__.py b/swh/loader/core/tests/__init__.py
--- a/swh/loader/core/tests/__init__.py
+++ b/swh/loader/core/tests/__init__.py
@@ -148,7 +148,7 @@
"""
origin = self.storage.origin_get(
dict(type=origin_type, url=origin_url))
- results = self.storage.origin_metadata_get_by(origin['id'])
+ results = self.storage.origin_metadata_get_by(origin['url'])
self.assertEqual(len(results), 1, results)
result = results[0]
self.assertEqual(result['metadata'], expected_origin_metadata)
diff --git a/swh/loader/core/tests/test_converters.py b/swh/loader/core/tests/test_converters.py
--- a/swh/loader/core/tests/test_converters.py
+++ b/swh/loader/core/tests/test_converters.py
@@ -83,13 +83,13 @@
expected_content = obj.copy()
expected_content.pop('data')
expected_content['status'] = 'absent'
- expected_content['origin'] = 42
+ expected_content['origin'] = 'http://example.org/'
expected_content['reason'] = 'Content too large'
# when
content = converters.content_for_storage(
obj, log, max_content_size=len(data) - 1,
- origin_id=expected_content['origin'],
+ origin_url=expected_content['origin'],
)
# then
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -30,7 +30,6 @@
origin = self.storage.origin_get(
self._test_prepare_origin_visit_data['origin'])
self.origin = origin
- self.origin_id = origin['id']
self.origin_url = origin['url']
self.visit_date = datetime.datetime.utcnow()
self.storage.origin_visit_add(origin['id'], self.visit_date)
@@ -285,7 +284,7 @@
tool_id = self.loader.send_tool(self.in_tool)
self.loader.send_origin_metadata(
- self.loader.origin_id, self.loader.visit_date, provider_id,
+ self.loader.visit_date, provider_id,
tool_id, {'test_metadata': 'foobar'})
self.assertOriginMetadataContains(
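
After this diff, a concrete loader only has to populate self.origin (and
optionally self.visit_date) in prepare_origin_visit; there is no id
bookkeeping left. A hypothetical minimal implementation of that hook
(CoreLoader is a stand-in name for whichever swh.loader.core.loader base
class is subclassed):

    import datetime

    class MyLoader(CoreLoader):  # stand-in name, not a real class
        def prepare_origin_visit(self, *args, **kwargs):
            # The only state a subclass must provide is the origin dict
            # itself, keyed by url, plus optionally the visit date.
            self.origin = {
                'type': 'git',
                'url': 'https://example.org/repo.git',
            }
            self.visit_date = datetime.datetime.now(
                tz=datetime.timezone.utc)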