diff --git a/swh/loader/debian/loader.py b/swh/loader/debian/loader.py
index 958905f..edf1d50 100644
--- a/swh/loader/debian/loader.py
+++ b/swh/loader/debian/loader.py
@@ -1,369 +1,377 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
import glob
import os
import subprocess
import shutil
import tempfile

from dateutil.parser import parse as parse_date
from debian.changelog import Changelog
from debian.deb822 import Dsc

from swh.core import hashutil
from swh.loader.dir.git.git import (
    walk_and_compute_sha1_from_directory, ROOT_TREE_KEY)

from . import converters

DEBIAN_KEYRINGS = glob.glob('/usr/share/keyrings/*')


def extract_src_pkg(dsc_path, destdir, log=None):
    """Extract a Debian source package to a given directory

    Note that after extraction the target directory will be the root of the
    extracted package, rather than containing it.

    Args:
        dsc_path: path to .dsc file
        destdir: directory where to extract the package
        log: a logging.Logger object

    Returns:
        None
    """
    if log:
-        log.debug('extract Debian source package %s' % dsc_path)
+        log.debug('extract Debian source package %s in %s' %
+                  (dsc_path, destdir.decode('utf-8')), extra={
+                      'swh_type': 'deb_extract',
+                      'swh_dsc': dsc_path,
+                      'swh_destdir': destdir.decode('utf-8'),
+                  })

    destdir_tmp = b''.join([destdir, b'.tmp'])
    logfile = b''.join([destdir, b'.log'])

    cmd = ['dpkg-source',
           '--no-copy', '--no-check',
           '--ignore-bad-version',
           '-x', dsc_path,
           destdir_tmp]

    with open(logfile, 'w') as stdout:
        subprocess.check_call(cmd, stdout=stdout, stderr=subprocess.STDOUT)

    os.rename(destdir_tmp, destdir)
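
# A minimal usage sketch for extract_src_pkg (hypothetical paths; assumes
# dpkg-source is installed and the .dsc plus its referenced tarballs sit in
# the same directory). Note that destdir is a bytes path:
#
#     extract_src_pkg('/srv/mirror/hello_2.10-1.dsc', b'/tmp/hello_2.10-1')
#     # /tmp/hello_2.10-1 now contains the unpacked source tree, and
#     # /tmp/hello_2.10-1.log holds the dpkg-source output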


def get_file_info(filepath):
    """Retrieve the original file information from the file at filepath.

    Args:
        filepath: the path to the original file

    Returns:
        A dict with the information about the original file:
            name: the file name
            sha1, sha1_git, sha256: the hashes for the original file
            length: the length of the original file
    """
    name = os.path.basename(filepath)
    if isinstance(name, bytes):
        name = name.decode('utf-8')

    ret = {
        'name': name,
    }

    hashes = hashutil.hashfile(filepath)
    for hash in hashes:
        ret[hash] = hashutil.hash_to_hex(hashes[hash])

    ret['length'] = os.lstat(filepath).st_size

    return ret
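
# For illustration, get_file_info returns a flat dict such as (sketch,
# hypothetical values, hash hexdigests elided):
#
#     {'name': 'hello_2.10-1.dsc',
#      'sha1': '...', 'sha1_git': '...', 'sha256': '...',
#      'length': 1791}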


def get_gpg_info_signature(gpg_info):
    """Extract the signature date from a deb822.GpgInfo object

    Args:
        gpg_info: a deb822.GpgInfo object

    Returns: a dictionary with the following keys:
        signature_date: a timezone-aware datetime.DateTime object with the date
            the signature was made
        signature_keyid: the keyid of the signing key
        signature_uid: the uid of the signing key, if found
    """
    uid = None

    if 'VALIDSIG' in gpg_info:
        key_id = gpg_info['VALIDSIG'][0]
        timestamp = gpg_info['VALIDSIG'][2]
        for key in gpg_info.uidkeys:
            if key in gpg_info:
                uid = gpg_info[key][-1]
                break

    elif 'ERRSIG' in gpg_info:
        key_id = gpg_info['ERRSIG'][0]
        timestamp = gpg_info['ERRSIG'][4]

    else:
        raise ValueError('Cannot find signature in gpg_info '
                         'object. Keys: %s' % gpg_info.keys())

    dt = datetime.datetime.utcfromtimestamp(int(timestamp))
    dt = dt.replace(tzinfo=datetime.timezone.utc)

    ret = {
        'committer_date': dt,
        'committer_offset': 0,
        'committer_keyid': key_id,
    }

    ret.update(converters.uid_to_person(uid, 'committer'))

    return ret


def get_package_metadata(dsc_path, extracted_path, log=None):
    """Get the package metadata from the source package at dsc_path,
    extracted in extracted_path.

    Args:
        dsc_path: the path to the package's dsc file
        extracted_path: the path where the package got extracted
        log: a logging.Logger object

    Returns: a dict with the following keys
        history: list of (package_name, package_version) tuples parsed from
            the package changelog
        source_files: information about all the files in the source package
    """
    ret = {}

    changelog_path = os.path.join(extracted_path, b'debian/changelog')
    with open(changelog_path, 'rb') as changelog:
        try:
            parsed_changelog = Changelog(changelog)
        except UnicodeDecodeError:
            if log:
                log.warn('Unknown encoding for changelog %s,'
                         ' falling back to iso' %
-                        changelog_path.decode('utf-8'))
+                        changelog_path.decode('utf-8'), extra={
+                            'swh_type': 'deb_changelog_encoding',
+                            'swh_changelog': changelog_path.decode('utf-8'),
+                        })

            # need to reset as Changelog scrolls to the end of the file
            changelog.seek(0)
            parsed_changelog = Changelog(changelog, encoding='iso-8859-15')

    ret['history'] = [(block.package, str(block.version))
                      for block in parsed_changelog]

    ret.update(converters.uid_to_person(parsed_changelog.author, 'author'))

    date = parse_date(parsed_changelog.date)
    ret['author_date'] = date
    ret['author_offset'] = date.tzinfo.utcoffset(date).seconds // 60

    with open(dsc_path, 'rb') as dsc:
        parsed_dsc = Dsc(dsc)

    source_files = [get_file_info(dsc_path)]

    dsc_dir = os.path.dirname(dsc_path)
    for file in parsed_dsc['files']:
        file_path = os.path.join(dsc_dir, file['name'])
        file_info = get_file_info(file_path)
        source_files.append(file_info)

    ret['source_files'] = source_files

    gpg_info = parsed_dsc.get_gpg_info(keyrings=DEBIAN_KEYRINGS)
    ret.update(get_gpg_info_signature(gpg_info))

    return ret
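
# Shape of the returned metadata, for illustration (sketch, hypothetical
# values; the committer_* entries come from get_gpg_info_signature and the
# author_* entries from converters.uid_to_person):
#
#     {'history': [('hello', '2.10-1'), ('hello', '2.9-2'), ...],
#      'author_date': ..., 'author_offset': 0,
#      'source_files': [{'name': 'hello_2.10-1.dsc', ...}, ...],
#      'committer_date': ..., 'committer_offset': 0,
#      'committer_keyid': '...', ...}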


def process_source_package(package, log=None):
    """Process a source package into its constituent components.

    The source package will be decompressed in a temporary directory.

    Args:
        package: a dict with the following keys:
            name: source package name
            version: source package version
            dsc: the full path of the package's DSC file.

    Returns:
        A tuple with two elements:
            package: the original package dict augmented with the following
                keys:
                metadata: the metadata from get_package_metadata
                basedir: the temporary directory in which the package was
                    decompressed
                directory: the sha1_git of the root directory of the package
            objects: a dictionary of the parsed directories and files, both
                indexed by id

    Raises:
        - FileNotFoundError if the dsc file does not exist.
    """
    if not os.path.exists(package['dsc']):
        raise FileNotFoundError('%s does not exist' % package['dsc'])

    basedir = tempfile.mkdtemp()
    debdir = os.path.join(basedir, '%s_%s' % (package['name'],
                                              package['version']))

    # the swh.loader.dir internals legitimately want bytes for paths
    debdir = debdir.encode('utf-8')

    extract_src_pkg(package['dsc'], debdir, log=log)

    parsed_objects = walk_and_compute_sha1_from_directory(debdir)
    root_tree = parsed_objects[ROOT_TREE_KEY][0]

    package = package.copy()
    package['basedir'] = basedir
    package['directory'] = root_tree['sha1_git']
    package['metadata'] = get_package_metadata(package['dsc'], debdir, log=log)

    return package, converters.dedup_objects(parsed_objects)
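
# Usage sketch (the package dict and paths are hypothetical):
#
#     package = {'name': 'hello', 'version': '2.10-1',
#                'dsc': '/srv/mirror/hello_2.10-1.dsc'}
#     package, objects = process_source_package(package)
#     # package now also carries 'basedir', 'directory' and 'metadata';
#     # objects maps the parsed directories and files, indexed by id
#     # (see converters.dedup_objects), ready for merge_objects/flushing.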


def process_source_packages(packages, log=None):
    """Execute process_source_package, but for lists of packages.

    Args:
        packages: an iterable of packages as expected by
            process_source_package
        log: a logging.Logger object

    Returns: a generator of partial results.

    Partial results have the following keys:
        objects: the accumulator for merge_objects. This member can be mutated
            to clear the pending contents and directories.
        tempdirs: the temporary directories processed so far. This list can be
            flushed if temporary directories are removed on the fly.
        packages: the list of packages processed so far.
    """
    objects = {
        'content': {},
        'directory': {},
        'content_seen': set(),
        'directory_seen': set(),
    }

    ret_packages = []
    tempdirs = []
    for package in packages:
        ret_package, package_objs = process_source_package(package)
        ret_packages.append(ret_package)
        converters.merge_objects(objects, package_objs)
        tempdirs.append(ret_package['basedir'])
        yield {
            'objects': objects,
            'packages': ret_packages,
            'tempdirs': tempdirs,
        }


def flush_content(storage, partial_result, content_max_length_one, log=None):
    """Flush the contents from a partial_result to storage

    Args:
        storage: an instance of swh.storage.Storage
        partial_result: a partial result as yielded by process_source_packages
        content_max_length_one: the maximum length of a persisted content
        log: a logging.Logger object

    This function mutates partial_result to empty the content dict
    """
    contents = partial_result['objects']['content']
    missing_ids = storage.content_missing(contents.values(),
                                          key_hash='sha1_git')

    if missing_ids:
        full_contents = (
            converters.shallow_content_to_content(contents[i],
                                                  content_max_length_one)
            for i in missing_ids)

        storage.content_add(full_contents)

    partial_result['objects']['content'] = {}


def flush_directory(storage, partial_result, log=None):
    """Flush the directories from a partial_result to storage

    Args:
        storage: an instance of swh.storage.Storage
        partial_result: a partial result as yielded by process_source_packages
        log: a logging.Logger object

    This function mutates partial_result to empty the directory dict
    """
    storage.directory_add(partial_result['objects']['directory'].values())
    partial_result['objects']['directory'] = {}


def remove_tempdirs(partial_result, log=None):
    """Remove the temporary files for the packages listed"""
    for tempdir in partial_result['tempdirs']:
        if os.path.isdir(tempdir):
            shutil.rmtree(tempdir)

    # Use the slice trick to empty the list in-place
    partial_result['tempdirs'][:] = []


def try_flush_partial(storage, partial_result,
                      content_packet_size=10000,
                      content_packet_length=1024 * 1024 * 1024 * 1024,
                      content_max_length_one=100 * 1024 * 1024,
                      directory_packet_size=25000, force=False, log=None):
    """Conditionally flush the partial result to storage.

    Args:
        storage: an instance of swh.storage.Storage
        partial_result: a partial result as yielded by process_source_packages
        content_packet_size: the number of contents that triggers a flush
        content_packet_length: the cumulated size of contents that triggers a
            flush
        content_max_length_one: the maximum length of a persisted content
        directory_packet_size: the number of directories that triggers a flush
        force: force a flush regardless of packet sizes
        log: a logging.Logger object
    """
    n_content = len(partial_result['objects']['content'])
    n_directory = len(partial_result['objects']['directory'])

    if force:
        # Don't compute the length if we don't care
        len_contents = 0
    else:
        len_contents = sum(
            content['length']
            for content in partial_result['objects']['content'].values()
            if content['length'] <= content_max_length_one
        )

    # Flush both contents and directories at once to be able to clear
    # tempfiles while we work
    if force or n_content >= content_packet_size or \
            len_contents >= content_packet_length or \
            n_directory >= directory_packet_size:
        flush_content(storage, partial_result, content_max_length_one, log=log)
        flush_directory(storage, partial_result, log=log)
        remove_tempdirs(partial_result, log=log)
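
# How the pieces above fit together, as a rough driving loop (sketch;
# `storage` is assumed to be a swh.storage.Storage instance, `packages` an
# iterable of package dicts as described in process_source_package, and
# `log` a logging.Logger or None):
#
#     partial_result = None
#     for partial_result in process_source_packages(packages, log=log):
#         # flush (and clean tempdirs) only once the packet thresholds are hit
#         try_flush_partial(storage, partial_result, log=log)
#     if partial_result is not None:
#         # force a final flush of whatever is left over
#         try_flush_partial(storage, partial_result, force=True, log=log)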