Changeset View
Standalone View
swh/indexer/metadata.py
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from copy import deepcopy | from copy import deepcopy | ||||
from swh.core.utils import grouper | |||||
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | ||||
from swh.indexer.origin_head import OriginHeadIndexer | from swh.indexer.origin_head import OriginHeadIndexer | ||||
from swh.indexer.metadata_dictionary import MAPPINGS | from swh.indexer.metadata_dictionary import MAPPINGS | ||||
from swh.indexer.metadata_detector import detect_metadata | from swh.indexer.metadata_detector import detect_metadata | ||||
from swh.indexer.metadata_detector import extract_minimal_metadata_dict | from swh.indexer.metadata_detector import extract_minimal_metadata_dict | ||||
from swh.indexer.storage import INDEXER_CFG_KEY | from swh.indexer.storage import INDEXER_CFG_KEY | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
REVISION_GET_BATCH_SIZE = 10 | |||||
class ContentMetadataIndexer(ContentIndexer): | class ContentMetadataIndexer(ContentIndexer): | ||||
"""Content-level indexer | """Content-level indexer | ||||
This indexer is in charge of: | This indexer is in charge of: | ||||
- filtering out content already indexed in content_metadata | - filtering out content already indexed in content_metadata | ||||
- reading content from objstorage with the content's id sha1 | - reading content from objstorage with the content's id sha1 | ||||
- computing metadata by given context | - computing metadata by given context | ||||
▲ Show 20 Lines • Show All 245 Lines • ▼ Show 20 Lines | def index_list(self, origin_urls): | ||||
[{'url': url} for url in origin_urls]) | [{'url': url} for url in origin_urls]) | ||||
for origin in origins: | for origin in origins: | ||||
head_result = self.origin_head_indexer.index(origin['url']) | head_result = self.origin_head_indexer.index(origin['url']) | ||||
if head_result: | if head_result: | ||||
head_result['origin_id'] = origin['id'] | head_result['origin_id'] = origin['id'] | ||||
origins_with_head.append(origin) | origins_with_head.append(origin) | ||||
head_rev_ids.append(head_result['revision_id']) | head_rev_ids.append(head_result['revision_id']) | ||||
head_revs = list(self.storage.revision_get(head_rev_ids)) | head_revs = [] | ||||
groups = grouper(head_rev_ids, REVISION_GET_BATCH_SIZE) | |||||
for group in groups: | |||||
ardumont: I'm wondering whether that should be storage-side or not ;)
(What I suggested earlier was possibly to do that storage-side.)
vlorentz: Doing it on the storage side is exactly the same as increasing the timeout.
head_revs.extend(self.storage.revision_get(group)) | |||||
assert len(head_revs) == len(head_rev_ids) | assert len(head_revs) == len(head_rev_ids) | ||||
results = [] | results = [] | ||||
for (origin, rev) in zip(origins_with_head, head_revs): | for (origin, rev) in zip(origins_with_head, head_revs): | ||||
if not rev: | if not rev: | ||||
self.log.warning('Missing head revision of origin %r', | self.log.warning('Missing head revision of origin %r', | ||||
origin['url']) | origin['url']) | ||||
continue | continue | ||||
▲ Show 20 Lines • Show All 53 Lines • Show Last 20 Lines |
I'm wondering whether that should be storage-side or not ;)
(What I suggested earlier was possibly to do that storage-side.)
Well, we'll see what this does.