Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/from_disk.py
Show First 20 Lines • Show All 91 Lines • ▼ Show 20 Lines | def get_object(self, oid): | ||||
# some we need to check ourselves | # some we need to check ourselves | ||||
self._check(obj) | self._check(obj) | ||||
except KeyError: | except KeyError: | ||||
_id = oid.decode('utf-8') | _id = oid.decode('utf-8') | ||||
self.log.warn('object %s not found, skipping' % _id, | self.log.warn('object %s not found, skipping' % _id, | ||||
extra={ | extra={ | ||||
'swh_type': 'swh_loader_git_missing_object', | 'swh_type': 'swh_loader_git_missing_object', | ||||
'swh_object_id': _id, | 'swh_object_id': _id, | ||||
'origin_id': self.origin_id, | 'origin_url': self.origin['url'], | ||||
}) | }) | ||||
return None | return None | ||||
except ObjectFormatException: | except ObjectFormatException: | ||||
_id = oid.decode('utf-8') | _id = oid.decode('utf-8') | ||||
self.log.warn('object %s malformed, skipping' % _id, | self.log.warn('object %s malformed, skipping' % _id, | ||||
extra={ | extra={ | ||||
'swh_type': 'swh_loader_git_missing_object', | 'swh_type': 'swh_loader_git_missing_object', | ||||
'swh_object_id': _id, | 'swh_object_id': _id, | ||||
'origin_id': self.origin_id, | 'origin_url': self.origin['url'], | ||||
}) | }) | ||||
return None | return None | ||||
except EmptyFileException: | except EmptyFileException: | ||||
_id = oid.decode('utf-8') | _id = oid.decode('utf-8') | ||||
self.log.warn('object %s corrupted (empty file), skipping' % _id, | self.log.warn('object %s corrupted (empty file), skipping' % _id, | ||||
extra={ | extra={ | ||||
'swh_type': 'swh_loader_git_missing_object', | 'swh_type': 'swh_loader_git_missing_object', | ||||
'swh_object_id': _id, | 'swh_object_id': _id, | ||||
'origin_id': self.origin_id, | 'origin_url': self.origin['url'], | ||||
}) | }) | ||||
else: | else: | ||||
return obj | return obj | ||||
def fetch_data(self): | def fetch_data(self): | ||||
"""Fetch the data from the data source""" | """Fetch the data from the data source""" | ||||
self.previous_snapshot = self.storage.snapshot_get_latest( | self.previous_snapshot = self.storage.snapshot_get_latest( | ||||
self.origin_id | self.origin['url'] | ||||
) | ) | ||||
type_to_ids = defaultdict(list) | type_to_ids = defaultdict(list) | ||||
for oid in self.iter_objects(): | for oid in self.iter_objects(): | ||||
obj = self.get_object(oid) | obj = self.get_object(oid) | ||||
if not obj: | if not obj: | ||||
continue | continue | ||||
type_name = obj.type_name | type_name = obj.type_name | ||||
Show All 16 Lines | def get_contents(self): | ||||
missing_contents = set(self.storage.content_missing( | missing_contents = set(self.storage.content_missing( | ||||
self.get_content_ids(), 'sha1_git')) | self.get_content_ids(), 'sha1_git')) | ||||
for oid in missing_contents: | for oid in missing_contents: | ||||
yield converters.dulwich_blob_to_content( | yield converters.dulwich_blob_to_content( | ||||
self.repo[hashutil.hash_to_bytehex(oid)], log=self.log, | self.repo[hashutil.hash_to_bytehex(oid)], log=self.log, | ||||
max_content_size=max_content_size, | max_content_size=max_content_size, | ||||
origin_id=self.origin_id) | origin_url=self.origin['url']) | ||||
def has_directories(self): | def has_directories(self): | ||||
"""Checks whether we need to load directories""" | """Checks whether we need to load directories""" | ||||
return bool(self.type_to_ids[b'tree']) | return bool(self.type_to_ids[b'tree']) | ||||
def get_directory_ids(self): | def get_directory_ids(self): | ||||
"""Get the directory identifiers from the git repository""" | """Get the directory identifiers from the git repository""" | ||||
return (hashutil.hash_to_bytes(id.decode()) | return (hashutil.hash_to_bytes(id.decode()) | ||||
▲ Show 20 Lines • Show All 194 Lines • Show Last 20 Lines |