Changeset View
Standalone View
swh/lister/json/lister.py
- This file was added.
# Copyright (C) 2019 the Software Heritage developers | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import json | |||||
import requests | |||||
from .models import JSONModel | |||||
from swh.scheduler import utils | |||||
from swh.lister.core.simple_lister import SimpleLister | |||||
class JSONLister(SimpleLister): | |||||
ardumont: Something more specific will be needed.
JSON is too generic and wrong as other apis can also… | |||||
MODEL = JSONModel | |||||
LISTER_NAME = 'json' | |||||
instance = 'json' | |||||
def __init__(self, url, override_config=None):
    """Record the URL of the JSON document, then set up the base lister.

    Args:
        url: location of the JSON file that will be downloaded and listed
        override_config: forwarded unchanged to the parent constructor
    """
    # Assigned before delegating: the parent constructor runs afterwards
    # and what it needs from this instance is not visible from here.
    self.url = url
    super().__init__(override_config=override_config)
def task_dict(self, origin_type, origin_url, **kwargs):
    """Build the scheduler task dict used to ingest one listed origin.

    Overrides the base lister implementation because more information is
    needed to create the ingestion task.

    Args:
        origin_type: type of the origin (not used by this implementation)
        origin_url: URL of the archive; passed both as a task argument and
            as the single entry of the ``tarballs`` list
        **kwargs: optional ``policy`` (defaults to ``'oneshot'``) and
            ``name``

    Returns:
        The value returned by ``utils.create_task_dict`` for a
        ``'load-tar'`` task.
    """
    policy = kwargs.get('policy', 'oneshot')
    # Review note (ardumont): the package's name could probably be dropped.
    package_name = kwargs.get('name')
    archives = [{'archive': origin_url, 'date': 0}]
    return utils.create_task_dict(
        'load-tar', policy, package_name, origin_url, tarballs=archives)
ardumontUnsubmitted Not Done Inline ActionsI guess we could:
About the integrity field, I guess we can split it to explicitly name it with its hash... packages = build_packages(...) # < to clear things up a bit # where packages is of the form: # [{'uri': origin_url, 'date': <some-date-isoformat>, "sha256": "MeBmE3qWJnbon2nRtlOC3pWn732RS4y5VvQepy4PUWs="}] return utils.create_task_dict( 'load-tar', kwargs.get('policy', 'oneshot'), origin=origin_url, packages=packages) @douardda ^ what do you think? ardumont: I guess we could:
- drop the package's name (it's mostly unused in other listers and i'm… | |||||
ardumontUnsubmitted Not Done Inline ActionsIf we could have the version of the package also (within the packages's entries), that'd be awesome. ardumont: If we could have the `version` of the package also (within the `packages`'s entries), that'd be… | |||||
lewoAuthorUnsubmitted Done Inline ActionsUnfortunately, it's difficult to get the version. packages = [ hello = [ name = "hello" version = "1.0" buildRecipe = "make" src = { url = "http://gnu/hello-1.0.tgz" sha = "bla" } ] So, the src attribute is not versioned. The version is on the package level. lewo: Unfortunately, it's difficult to get the version.
In nixpkgs, we basically have this kind of… | |||||
ardumontUnsubmitted Not Done Inline ActionsRight. Nonetheless, we could have that lister parse the version from what's provided (here the url then). Developing the loader tar (D2145) raises interesting questions about the gnu loader [1] and the new one. To develop further, i guess we need some more dataset sample though ;) ardumont: Right.
Nonetheless, we could have that lister parse the version from what's provided (here the… | |||||
ardumontUnsubmitted Not Done Inline ActionsAlso for the hash, I mean the base64 decoded value as ascii string... so packages really becomes: [{'uri': origin_url, 'date': <some-date-isoformat>, "sha256": "31e066137a962676e89f69d1b65382de95a7ef7d914b8cb956f41ea72e0f516b"}] That unifies with other existing lister output and loader expectations. The following will help: from typing import Tuple def integrity_to_hash(integrity_value: str) -> Tuple[str, str]: """Parse an integrity field into a field (hash_name, hash_hex) [1] [1] https://www.w3.org/TR/SRI """ hash_name, base64_value = integrity_value.split('-') from base64 import b64decode from binascii import hexlify hash_hex = hexlify(b64decode(base64_value)).decode('utf-8') return hash_name, hash_hex def test_integrity_to_hash(): """Parsing an integrity field hash should return a tuple hash_name, hash_hex strings """ actual_hash_name, actual_hash_hex = integrity_to_hash( 'sha256-MeBmE3qWJnbon2nRtlOC3pWn732RS4y5VvQepy4PUWs=') assert actual_hash_name == 'sha256' assert actual_hash_hex == '31e066137a962676e89f69d1b65382de95a7ef7d914b8cb956f41ea72e0f516b' # noqa ardumont: Also for the hash, I mean the base64 decoded value as ascii string...
so packages really… | |||||
def safely_issue_request(self, identifier):
    """Download the JSON file at ``self.url`` and return its decoded content.

    Args:
        identifier: resource identifier (unused)

    Returns:
        The Python object obtained by parsing the response body as
        UTF-8 encoded JSON.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        ValueError: if the body is not valid UTF-8 or not valid JSON.
    """
    # NOTE(review): no timeout is passed, so a stalled server can block
    # this call indefinitely -- consider adding one.
    response = requests.get(self.url, allow_redirects=True)
    # Fail loudly on HTTP errors instead of trying to parse an error page
    # as if it were the package list.
    response.raise_for_status()
    # TODO: support gzip content as well
    return json.loads(response.content.decode('utf-8'))
def list_packages(self, response):
    """Keep only the entries of the response whose source type is "url".

    Args:
        response: iterable of package entries; each entry is indexed as
            ``entry["source"]["type"]``

    Returns:
        List of the matching entries, in their original order.
    """
    url_packages = []
    for package in response:
        if package["source"]["type"] == "url":
            url_packages.append(package)
    return url_packages
def get_model_from_repo(self, source):
    """Convert one listed package entry into the dict used for the model.

    Args:
        source: package entry with a ``name`` key and a ``source`` mapping
            holding ``url`` and, optionally, ``integrity``

    Returns:
        Dict with the keys ``uid``, ``name``, ``full_name``, ``html_url``,
        ``origin_url`` and ``origin_type``.
    """
    details = source['source']
    # The integrity attribute is a hash of the content in the SRI format
    # (see https://www.w3.org/TR/SRI). It is used as the uid when present;
    # otherwise the URL is used.
    uid = details['integrity'] if 'integrity' in details else details['url']
    name = source['name']
    url = details['url']
    return {
        'uid': uid,
        'name': name,
        'full_name': name,
        'html_url': url,
        'origin_url': url,
        # Review note (ardumont): per the mailing list discussion, tar
        # origins are not the only kind that could be listed.
        'origin_type': 'tar',
    }
def transport_response_simplified(self, response):
    """Turn each listed entry into its model dict.

    Review note (ardumont): after rebasing, this override can go away, as
    the base implementation is said to be written the same way.

    Args:
        response: iterable of package entries

    Returns:
        List with the result of ``self.get_model_from_repo`` for every
        entry, in the same order.
    """
    models = []
    for entry in response:
        models.append(self.get_model_from_repo(entry))
    return models
Something more specific will be needed.
JSON is too generic and misleading, since other APIs can also list JSON.
FunctionalPackageManagerLister?
FunctionalPackageLister?
In my head, only Guix and Nix qualify for it so far (and they will have something sufficiently similar, IIUC).