Changeset View
Standalone View
swh/loader/package/debian/loader.py
Show First 20 Lines • Show All 83 Lines • ▼ Show 20 Lines | class DebianLoader(PackageLoader): | ||||
def get_package_info(self, version: str) -> Generator[ | def get_package_info(self, version: str) -> Generator[ | ||||
Tuple[str, Mapping[str, Any]], None, None]: | Tuple[str, Mapping[str, Any]], None, None]: | ||||
meta = self.packages[version] | meta = self.packages[version] | ||||
p_info = meta.copy() | p_info = meta.copy() | ||||
p_info['raw'] = meta | p_info['raw'] = meta | ||||
yield release_name(version), p_info | yield release_name(version), p_info | ||||
def resolve_revision_from( | def resolve_revision_from( | ||||
self, known_package_artifacts: Dict, artifact_metadata: Dict) \ | self, known_package_artifacts: Mapping, | ||||
artifact_metadata: Mapping) \ | |||||
-> Optional[bytes]: | -> Optional[bytes]: | ||||
artifacts_to_fetch = artifact_metadata['files'] | return resolve_revision_from( | ||||
ardumont: docstring is in the main class [1]
i do not know how we can deal with that properly without… | |||||
logger.debug('k_p_artifacts: %s', known_package_artifacts) | known_package_artifacts, artifact_metadata) | ||||
logger.debug('artifacts_to_fetch: %s', artifacts_to_fetch) | |||||
for rev_id, known_artifacts in known_package_artifacts.items(): | |||||
logger.debug('Revision: %s', rev_id) | |||||
logger.debug('Associated known_artifacts: %s', known_artifacts) | |||||
known_artifacts = known_artifacts['extrinsic']['raw']['files'] | |||||
rev_found = True | |||||
Done Inline Actionsthis i found backward... ardumont: this i found backward... | |||||
for a_name, k_artifact in known_artifacts.items(): | |||||
artifact_to_fetch = artifacts_to_fetch.get(a_name) | |||||
logger.debug('artifact_to_fetch: %s', artifact_to_fetch) | |||||
if artifact_to_fetch is None: | |||||
# as soon as we do not see an artifact, we consider we need | |||||
# to check the other revision | |||||
rev_found = False | |||||
if k_artifact['sha256'] != artifact_to_fetch['sha256']: | |||||
# Hash is different, we consider we need to check the other | |||||
# revisions | |||||
rev_found = False | |||||
if rev_found: | |||||
logger.debug('Existing revision %s found for new artifacts.', | |||||
rev_id) | |||||
return rev_id | |||||
logger.debug('No existing revision found for the new artifacts.') | |||||
return None | |||||
def download_package(self, p_info: Mapping[str, Any], | def download_package(self, p_info: Mapping[str, Any], | ||||
tmpdir: str) -> List[Tuple[str, Mapping]]: | tmpdir: str) -> List[Tuple[str, Mapping]]: | ||||
"""Contrary to other package loaders (1 package, 1 artifact), | """Contrary to other package loaders (1 package, 1 artifact), | ||||
Not Done Inline Actionsthese lines are backwards olasd: these lines are backwards | |||||
`a_metadata` represents the package's datafiles set to fetch: | `a_metadata` represents the package's datafiles set to fetch: | ||||
- <package-version>.orig.tar.gz | - <package-version>.orig.tar.gz | ||||
- <package-version>.dsc | - <package-version>.dsc | ||||
- <package-version>.diff.gz | - <package-version>.diff.gz | ||||
This is delegated to the `download_package` function. | This is delegated to the `download_package` function. | ||||
""" | """ | ||||
all_hashes = download_package(p_info, tmpdir) | all_hashes = download_package(p_info, tmpdir) | ||||
logger.debug('all_hashes: %s', all_hashes) | logger.debug('all_hashes: %s', all_hashes) | ||||
res = [] | res = [] | ||||
for hashes in all_hashes.values(): | for hashes in all_hashes.values(): | ||||
res.append((tmpdir, hashes)) | res.append((tmpdir, hashes)) | ||||
Not Done Inline ActionsIt looks like what we want is to match the artifacts of the current package with all packages referenced in the previous snapshot in turn. I _think_ that the logic in this loop is inverted, but I have a hard time comprehending it. The comments are backwards too :( The logic would be much clearer if we did the comparison using set logic:
We could even make these sets ordered by filename, which lets us turn them into (hashable) tuples, which lets us build a dict mapping ((filename, (hash, hash, size)), ...) to revision_ids directly when fetching the previous snapshot, making this function a plain dict lookup. olasd: It looks like what we want is to match the artifacts of the current package with all packages… | |||||
logger.debug('res: %s', res) | logger.debug('res: %s', res) | ||||
return res | return res | ||||
def uncompress(self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], | def uncompress(self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], | ||||
dest: str) -> str: | dest: str) -> str: | ||||
logger.debug('dl_artifacts: %s', dl_artifacts) | logger.debug('dl_artifacts: %s', dl_artifacts) | ||||
return extract_package(dl_artifacts, dest=dest) | return extract_package(dl_artifacts, dest=dest) | ||||
Not Done Inline ActionsThis try block feels overly broad. Could you just do if 'files' not in artifact_metadata: logger.warning(...) return None # or even artifact_metadata.get('revision_id') at the beginning of the function instead? olasd: This try block feels overly broad.
Could you just do
```lang=python
if 'files' not in… | |||||
Done Inline Actionsthat's not enough that can explode a bit later. ardumont: that's not enough
that can explode a bit later. | |||||
Not Done Inline ActionsI agree with @olasd that this try block is too long. There's a high risk of catching unrelated error. vlorentz: I agree with @olasd that this try block is too long. There's a high risk of catching unrelated… | |||||
Done Inline Actionsi agree the try is long. So it should fail only for that. ardumont: i agree the try is long.
But here, all this shenanigans is to solve a revision from metadata… | |||||
Done Inline Actions
as in: 8d073f5dc6c8b7a322618c35'}}, 'version': '0.3.9-2', 'revision_id': None}} Dec 19 14:58:02 worker01 python3[557]: [2019-12-19 14:58:02,443: ERROR/ForkPoolWorker-1] Fail to load deb://Debian/packages/freshplayerplugin Traceback (most recent call last): File "/usr/lib/python3/dist-packages/swh/loader/package/loader.py", line 286, in load known_artifacts, p_info['raw']) File "/usr/lib/python3/dist-packages/swh/loader/package/debian/loader.py", line 104, in resolve_revision_from known_artifacts = known_artifacts['extrinsic']['raw']['files'] KeyError: 'extrinsic' ardumont: > that can explode a bit later
as in:
```
8d073f5dc6c8b7a322618c35'}}, 'version': '0.3.9-2'… | |||||
Not Done Inline ActionsOne more argument for parsing these only once when fetching the old snapshot (see my other inline comment). ;) olasd: One more argument for parsing these only once when fetching the old snapshot (see my other… | |||||
def build_revision(self, a_metadata: Mapping[str, Any], | def build_revision(self, a_metadata: Mapping[str, Any], | ||||
uncompressed_path: str) -> Dict: | uncompressed_path: str) -> Dict: | ||||
dsc_url, dsc_name = dsc_information(a_metadata) | dsc_url, dsc_name = dsc_information(a_metadata) | ||||
if not dsc_name: | if not dsc_name: | ||||
raise ValueError( | raise ValueError( | ||||
'dsc name for url %s should not be None' % dsc_url) | 'dsc name for url %s should not be None' % dsc_url) | ||||
dsc_path = path.join(path.dirname(uncompressed_path), dsc_name) | dsc_path = path.join(path.dirname(uncompressed_path), dsc_name) | ||||
i_metadata = get_package_metadata( | i_metadata = get_package_metadata( | ||||
Show All 26 Lines | def build_revision(self, a_metadata: Mapping[str, Any], | ||||
'provider': dsc_url, | 'provider': dsc_url, | ||||
'when': self.visit_date.isoformat(), | 'when': self.visit_date.isoformat(), | ||||
'raw': a_metadata, | 'raw': a_metadata, | ||||
}, | }, | ||||
} | } | ||||
} | } | ||||
def resolve_revision_from(known_package_artifacts: Mapping,
                          artifact_metadata: Mapping) -> Optional[bytes]:
    """Given known package artifacts (resolved from the snapshot of the
    previous visit) and the metadata of the new artifact set to fetch, try
    to find the existing revision matching that artifact set.

    Args:
        known_package_artifacts: mapping of revision id (bytes) to the
            revision's stored metadata; the per-file artifact descriptions
            are expected under ``['extrinsic']['raw']['files']``.
        artifact_metadata: metadata of the package to fetch; its ``files``
            entry maps file names to their hashes (including ``sha256``).

    Returns:
        The matching revision id if some known revision references exactly
        artifacts with the same names and sha256 checksums as the new
        artifact set, None otherwise.

    """
    artifacts_to_fetch = artifact_metadata.get('files')
    if not artifacts_to_fetch:
        # Nothing to compare against: no revision can be resolved.
        return None
    logger.debug('k_p_artifacts: %s', known_package_artifacts)
    logger.debug('artifacts_to_fetch: %s', artifacts_to_fetch)
    for rev_id, known_artifacts in known_package_artifacts.items():
        logger.debug('Revision: %s', rev_id)
        logger.debug('Associated known_artifacts: %s', known_artifacts)
        # Some previously loaded revisions lack the extrinsic metadata
        # layout (this used to raise KeyError: 'extrinsic'); skip them
        # instead of crashing.
        known_files = known_artifacts.get(
            'extrinsic', {}).get('raw', {}).get('files')
        if not known_files:
            continue
        rev_found = True
        for a_name, k_artifact in known_files.items():
            artifact_to_fetch = artifacts_to_fetch.get(a_name)
            logger.debug('artifact_to_fetch: %s', artifact_to_fetch)
            if artifact_to_fetch is None:
                # A known artifact is absent from the new artifact set:
                # this revision cannot match; check the next revision.
                # (Without the break, the sha256 comparison below would
                # raise TypeError on None.)
                rev_found = False
                break
            if k_artifact['sha256'] != artifact_to_fetch['sha256']:
                # Hash differs: this revision cannot match either.
                rev_found = False
                break
        if rev_found:
            logger.debug('Existing revision %s found for new artifacts.',
                         rev_id)
            return rev_id
    logger.debug('No existing revision found for the new artifacts.')
    return None
def uid_to_person(uid: str) -> Mapping[str, str]: | def uid_to_person(uid: str) -> Mapping[str, str]: | ||||
"""Convert an uid to a person suitable for insertion. | """Convert an uid to a person suitable for insertion. | ||||
Args: | Args: | ||||
uid: an uid of the form "Name <email@ddress>" | uid: an uid of the form "Name <email@ddress>" | ||||
Returns: | Returns: | ||||
a dictionary with the following keys: | a dictionary with the following keys: | ||||
▲ Show 20 Lines • Show All 201 Lines • Show Last 20 Lines |
docstring is in the main class [1]
I do not know how we can deal with that properly without replicating stuff.
[1] https://forge.softwareheritage.org/source/swh-loader-core/browse/master/swh/loader/package/loader.py$0-158