diff --git a/swh/loader/core/converters.py b/swh/loader/core/converters.py --- a/swh/loader/core/converters.py +++ b/swh/loader/core/converters.py @@ -9,7 +9,7 @@ def content_for_storage(content, log=None, max_content_size=None, - origin_id=None): + origin_url=None): """Prepare content to be ready for storage Note: @@ -30,7 +30,7 @@ ret.pop('data', None) ret.update({'status': 'absent', 'reason': 'Content too large', - 'origin': origin_id}) + 'origin': origin_url}) return ret if 'data' not in ret: diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information import datetime +import hashlib import logging import os import psycopg2 @@ -205,15 +206,16 @@ raise NotImplementedError def get_save_data_path(self): - """The path to which we save the data""" + """The path to which we archive the loader's raw data""" if not hasattr(self, '__save_data_path'): - origin_id = self.origin_id year = str(self.visit_date.year) + origin_url_hash = hashlib.sha1(self.origin['url']).hexdigest() + path = os.path.join( self.config['save_data_path'], - "%04d" % (origin_id % 10000), - "%08d" % origin_id, + 'sha1:' + origin_url_hash[0:2], + origin_url_hash, year, ) @@ -233,7 +235,7 @@ 'swh_num': 1, 'swh_id': log_id }) - origin_id = self.storage.origin_add_one(origin) + self.storage.origin_add_one(origin) self.log.debug('Done creating %s origin for %s' % (origin['type'], origin['url']), extra={ @@ -243,24 +245,23 @@ 'swh_id': log_id }) - return origin_id - @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) - def send_origin_visit(self, origin_id, visit_date): + def send_origin_visit(self, visit_date): log_id = str(uuid.uuid4()) self.log.debug( 'Creating origin_visit for origin %s at time %s' % ( - origin_id, visit_date), + self.origin['url'], visit_date), extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'origin_visit', 'swh_num': 1, 'swh_id': log_id }) - origin_visit = self.storage.origin_visit_add(origin_id, visit_date) + origin_visit = self.storage.origin_visit_add( + self.origin['url'], visit_date) self.log.debug( 'Done Creating origin_visit for origin %s at time %s' % ( - origin_id, visit_date), + self.origin['url'], visit_date), extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'origin_visit', @@ -334,12 +335,12 @@ return provider_id @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) - def send_origin_metadata(self, origin_id, visit_date, provider_id, + def send_origin_metadata(self, visit_date, provider_id, tool_id, metadata): log_id = str(uuid.uuid4()) self.log.debug( 'Creating origin_metadata for origin %s at time %s with provider_id %s and tool_id %s' % ( # noqa - origin_id, visit_date, provider_id, tool_id), + self.origin['url'], visit_date, provider_id, tool_id), extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'origin_metadata', @@ -347,11 +348,11 @@ 'swh_id': log_id }) - self.storage.origin_metadata_add(origin_id, visit_date, provider_id, - tool_id, metadata) + self.storage.origin_metadata_add( + self.origin['url'], visit_date, provider_id, tool_id, metadata) self.log.debug( 'Done Creating origin_metadata for origin %s at time %s with provider %s and tool %s' % ( # noqa - origin_id, visit_date, provider_id, tool_id), + self.origin['url'], visit_date, provider_id, tool_id), extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'origin_metadata', @@ -360,21 +361,22 @@ }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) - def update_origin_visit(self, origin_id, visit, status): + def update_origin_visit(self, status): log_id = str(uuid.uuid4()) self.log.debug( 'Updating origin_visit for origin %s with status %s' % ( - origin_id, status), + self.origin['url'], status), extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'origin_visit', 'swh_num': 1, 'swh_id': log_id }) - self.storage.origin_visit_update(origin_id, visit, status) + self.storage.origin_visit_update( + self.origin['url'], self.visit, status) self.log.debug( 'Done updating origin_visit for origin %s with status %s' % ( - origin_id, status), + self.origin['url'], status), extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'origin_visit', @@ -486,7 +488,7 @@ def send_snapshot(self, snapshot): self.storage.snapshot_add([snapshot]) self.storage.origin_visit_update( - self.origin_id, self.visit, snapshot=snapshot['id']) + self.origin['url'], self.visit, snapshot=snapshot['id']) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def filter_missing_contents(self, contents): @@ -509,7 +511,7 @@ yield converters.content_for_storage( contents_per_key[key], max_content_size=max_content_size, - origin_id=self.origin_id, + origin_url=self.origin['url'], ) def bulk_send_contents(self, contents): @@ -665,7 +667,7 @@ send_in_packets(releases, self.send_releases, packet_size) def open_fetch_history(self): - return self.storage.fetch_history_start(self.origin_id) + return self.storage.fetch_history_start(self.origin['url']) def close_fetch_history_success(self, fetch_history_id): data = { @@ -747,7 +749,7 @@ @abstractmethod def prepare_origin_visit(self, *args, **kwargs): """First step executed by the loader to prepare origin and visit - references. Set/update self.origin, self.origin_id and + references. Set/update self.origin, and optionally self.origin_url, self.visit_date. """ @@ -758,17 +760,16 @@ self.visit references. """ - origin_id = self.origin.get('id') - if origin_id: - self.origin_id = origin_id - else: - self.origin_id = self.send_origin(self.origin) - self.origin['id'] = self.origin_id + origin = self.origin.copy() + if 'id' in origin: + # TODO: remove the condition when we finished migrating away + # from origin ids + del origin['id'] + self.send_origin(origin) if not self.visit_date: # now as default visit_date if not provided self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) - self.origin_visit = self.send_origin_visit( - self.origin_id, self.visit_date) + self.origin_visit = self.send_origin_visit(self.visit_date) self.visit = self.origin_visit['visit'] @abstractmethod @@ -899,8 +900,7 @@ self.store_metadata() self.close_fetch_history_success(fetch_history_id) - self.update_origin_visit( - self.origin_id, self.visit, status=self.visit_status()) + self.update_origin_visit(status=self.visit_status()) self.post_load() except Exception: self.log.exception('Loading failure, updating to `partial` status', @@ -909,8 +909,7 @@ 'swh_task_kwargs': kwargs, }) self.close_fetch_history_failure(fetch_history_id) - self.update_origin_visit( - self.origin_id, self.visit, status='partial') + self.update_origin_visit(status='partial') self.post_load(success=False) return {'status': 'failed'} finally: diff --git a/swh/loader/core/tests/__init__.py b/swh/loader/core/tests/__init__.py --- a/swh/loader/core/tests/__init__.py +++ b/swh/loader/core/tests/__init__.py @@ -148,7 +148,7 @@ """ origin = self.storage.origin_get( dict(type=origin_type, url=origin_url)) - results = self.storage.origin_metadata_get_by(origin['id']) + results = self.storage.origin_metadata_get_by(origin['url']) self.assertEqual(len(results), 1, results) result = results[0] self.assertEqual(result['metadata'], expected_origin_metadata) diff --git a/swh/loader/core/tests/test_converters.py b/swh/loader/core/tests/test_converters.py --- a/swh/loader/core/tests/test_converters.py +++ b/swh/loader/core/tests/test_converters.py @@ -83,13 +83,13 @@ expected_content = obj.copy() expected_content.pop('data') expected_content['status'] = 'absent' - expected_content['origin'] = 42 + expected_content['origin'] = 'http://example.org/' expected_content['reason'] = 'Content too large' # when content = converters.content_for_storage( obj, log, max_content_size=len(data) - 1, - origin_id=expected_content['origin'], + origin_url=expected_content['origin'], ) # then diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py --- a/swh/loader/core/tests/test_loader.py +++ b/swh/loader/core/tests/test_loader.py @@ -30,7 +30,6 @@ origin = self.storage.origin_get( self._test_prepare_origin_visit_data['origin']) self.origin = origin - self.origin_id = origin['id'] self.origin_url = origin['url'] self.visit_date = datetime.datetime.utcnow() self.storage.origin_visit_add(origin['id'], self.visit_date) @@ -285,7 +284,7 @@ tool_id = self.loader.send_tool(self.in_tool) self.loader.send_origin_metadata( - self.loader.origin_id, self.loader.visit_date, provider_id, + self.loader.visit_date, provider_id, tool_id, {'test_metadata': 'foobar'}) self.assertOriginMetadataContains(