Changeset View
Standalone View
swh/indexer/metadata.py
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from copy import deepcopy | from copy import deepcopy | ||||
from swh.core.utils import grouper | |||||
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | ||||
from swh.indexer.origin_head import OriginHeadIndexer | from swh.indexer.origin_head import OriginHeadIndexer | ||||
from swh.indexer.metadata_dictionary import MAPPINGS | from swh.indexer.metadata_dictionary import MAPPINGS | ||||
from swh.indexer.metadata_detector import detect_metadata | from swh.indexer.metadata_detector import detect_metadata | ||||
from swh.indexer.metadata_detector import extract_minimal_metadata_dict | from swh.indexer.metadata_detector import extract_minimal_metadata_dict | ||||
from swh.indexer.storage import INDEXER_CFG_KEY | from swh.indexer.storage import INDEXER_CFG_KEY | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
REVISION_GET_BATCH_SIZE = 10 | |||||
class ContentMetadataIndexer(ContentIndexer): | class ContentMetadataIndexer(ContentIndexer): | ||||
"""Content-level indexer | """Content-level indexer | ||||
This indexer is in charge of: | This indexer is in charge of: | ||||
- filtering out content already indexed in content_metadata | - filtering out content already indexed in content_metadata | ||||
- reading content from objstorage with the content's id sha1 | - reading content from objstorage with the content's id sha1 | ||||
- computing metadata by given context | - computing metadata by given context | ||||
▲ Show 20 Lines • Show All 245 Lines • ▼ Show 20 Lines | def index_list(self, origin_urls): | ||||
[{'url': url} for url in origin_urls]) | [{'url': url} for url in origin_urls]) | ||||
for origin in origins: | for origin in origins: | ||||
head_result = self.origin_head_indexer.index(origin['url']) | head_result = self.origin_head_indexer.index(origin['url']) | ||||
if head_result: | if head_result: | ||||
head_result['origin_id'] = origin['id'] | head_result['origin_id'] = origin['id'] | ||||
origins_with_head.append(origin) | origins_with_head.append(origin) | ||||
head_rev_ids.append(head_result['revision_id']) | head_rev_ids.append(head_result['revision_id']) | ||||
head_revs = list(self.storage.revision_get(head_rev_ids)) | head_revs = [] | ||||
groups = grouper(head_rev_ids, REVISION_GET_BATCH_SIZE) | |||||
for group in groups: | |||||
ardumont: I'm wondering whether that should be storage-side or not ;)
(What I suggested earlier was possibly to do that storage-side.)
vlorentz: Doing it on the storage side is exactly the same as increasing the timeout.
head_revs.extend(self.storage.revision_get(group)) | |||||
assert len(head_revs) == len(head_rev_ids) | assert len(head_revs) == len(head_rev_ids) | ||||
results = [] | results = [] | ||||
for (origin, rev) in zip(origins_with_head, head_revs): | for (origin, rev) in zip(origins_with_head, head_revs): | ||||
if not rev: | if not rev: | ||||
self.log.warning('Missing head revision of origin %r', | self.log.warning('Missing head revision of origin %r', | ||||
origin['url']) | origin['url']) | ||||
continue | continue | ||||
▲ Show 20 Lines • Show All 53 Lines • Show Last 20 Lines |
I'm wondering whether that should be storage-side or not ;)
(What I suggested earlier was possibly to do that storage-side.)
Well, we'll see what this does.