Page Menu | Home | Software Heritage

D742.id2321.diff
No One | Temporary

D742.id2321.diff

diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -160,7 +160,8 @@
It:
- filters out the non textual content
- - (optionally) filters out content already indexed (cf :callable:`range`)
+ - (optionally) filters out content already indexed (cf
+ :func:`indexed_contents_in_range`)
- reads content from objstorage per the content's id (sha1)
- computes {mimetype, encoding} from that content
- stores result in storage
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -5,17 +5,20 @@
import unittest
-from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS
-from swh.indexer.metadata_detector import detect_metadata
-from swh.indexer.metadata_detector import extract_minimal_metadata_dict
-from swh.indexer.metadata import ContentMetadataIndexer
-from swh.indexer.metadata import RevisionMetadataIndexer
-from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
-from swh.indexer.tests.test_utils import MockIndexerStorage
-
from swh.model.hashutil import hash_to_bytes
-from .test_utils import BASE_TEST_CONFIG
+from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS
+from swh.indexer.metadata_detector import (
+ detect_metadata, extract_minimal_metadata_dict
+)
+from swh.indexer.metadata import (
+ ContentMetadataIndexer, RevisionMetadataIndexer
+)
+
+from .test_utils import (
+ MockObjStorage, MockStorage, MockIndexerStorage,
+ BASE_TEST_CONFIG
+)
class ContentMetadataTestIndexer(ContentMetadataIndexer):
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -4,62 +4,33 @@
# See top-level LICENSE file for more information
import time
-import logging
import unittest
-from celery import task
-
-from swh.indexer.metadata import OriginMetadataIndexer, \
- RevisionMetadataIndexer, ContentMetadataIndexer
-from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
-from swh.indexer.tests.test_utils import MockIndexerStorage
-from swh.indexer.tests.test_origin_head import OriginHeadTestIndexer
-
-from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture
+from celery import task
from swh.model.hashutil import hash_to_bytes
-from .test_utils import BASE_TEST_CONFIG
-
-
-class OriginMetadataTestIndexer(OriginMetadataIndexer):
- def parse_config_file(self, *args, **kwargs):
- return {
- **BASE_TEST_CONFIG,
- 'tools': [],
- }
-
- def prepare(self):
- super().prepare()
- self.storage = MockStorage()
- self.idx_storage = MockIndexerStorage()
- self.objstorage = MockObjStorage()
-
+from swh.indexer.metadata import (
+ OriginMetadataIndexer, RevisionMetadataIndexer
+)
-class ContentMetadataTestIndexer(ContentMetadataIndexer):
- """Specific Metadata whose configuration is enough to satisfy the
- indexing tests.
- """
- def prepare(self):
- self.idx_storage = MockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- self.objstorage = MockObjStorage()
- self.tools = self.register_tools(self.config['tools'])
- self.tool = self.tools[0]
- self.results = []
+from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture
+from .test_utils import (
+ MockObjStorage, MockStorage, MockIndexerStorage,
+ BASE_TEST_CONFIG
+)
+from .test_origin_head import OriginHeadTestIndexer
+from .test_metadata import ContentMetadataTestIndexer
class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
-
ContentMetadataIndexer = ContentMetadataTestIndexer
- def prepare(self):
- self.config = {
- 'storage': {},
- 'objstorage': {},
- 'indexer_storage': {},
+ def parse_config_file(self, *args, **kwargs):
+ return {
+ **BASE_TEST_CONFIG,
'tools': {
'name': 'swh-metadata-detector',
'version': '0.0.2',
@@ -69,21 +40,38 @@
}
}
}
- self.storage = MockStorage()
+
+ def prepare(self):
+ super().prepare()
self.idx_storage = MockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
+ self.storage = MockStorage()
self.objstorage = MockObjStorage()
- self.tools = self.register_tools(self.config['tools'])
- self.tool = self.tools[0]
+
+
+revision_metadata_indexer = RevisionMetadataTestIndexer()
@task
def revision_metadata_test_task(*args, **kwargs):
- indexer = RevisionMetadataTestIndexer()
+ indexer = revision_metadata_indexer
indexer.run(*args, **kwargs)
return indexer.results
+class OriginMetadataTestIndexer(OriginMetadataIndexer):
+ def parse_config_file(self, *args, **kwargs):
+ return {
+ **BASE_TEST_CONFIG,
+ 'tools': []
+ }
+
+ def prepare(self):
+ super().prepare()
+ self.storage = MockStorage()
+ self.objstorage = MockObjStorage()
+ self.idx_storage = MockIndexerStorage()
+
+
@task
def origin_intrinsic_metadata_test_task(*args, **kwargs):
indexer = OriginMetadataTestIndexer()
@@ -149,18 +137,30 @@
rev_metadata = {
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'translated_metadata': metadata,
- 'indexer_configuration_id': 7,
}
origin_metadata = {
'origin_id': 54974445,
'from_revision': hash_to_bytes(
'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'metadata': metadata,
- 'indexer_configuration_id': 7,
}
expected_results = [
- ('origin_intrinsic_metadata', True, [origin_metadata]),
- ('revision_metadata', True, [rev_metadata])]
+ ('revision_metadata', True, [rev_metadata]),
+ ('origin_intrinsic_metadata', True, [origin_metadata]),
+ ]
results = list(indexer.idx_storage.added_data)
+ for result in results:
+ metadata = result[2]
+ for item in metadata:
+ # cannot check those (generated ids)
+ del item['indexer_configuration_id']
+
+ from pprint import pprint
+ print('##### actual')
+ pprint(results)
+ # print()
+ # print('##### expected')
+ # pprint(expected_results)
+
self.assertCountEqual(expected_results, results)
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -357,7 +357,15 @@
for item in self.revision_metadata.get(id_):
item = item.copy()
tool_id = item.pop('indexer_configuration_id')
- item['tool'] = self.tools[tool_id].copy()
+ if tool_id in self.tools:
+ item['tool'] = self.tools[tool_id].copy()
+ else: # HACK: this needs to be removed altogether
+ item['tool'] = {
+ 'id': tool_id,
+ 'name': tool_id[0],
+ 'version': tool_id[1],
+ 'configuration': tool_id[2],
+ }
yield item
def origin_intrinsic_metadata_add(self, metadata, conflict_update=None):

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 17, 10:03 PM (3 h, 31 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225991

Event Timeline