Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/metadata/github.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Metadata fetcher for GitHub.""" | """Metadata fetcher for GitHub.""" | ||||
import json | |||||
import re | import re | ||||
from typing import List, Optional, Tuple | from typing import List, Optional, Tuple | ||||
import urllib.parse | import urllib.parse | ||||
from swh.lister.github.utils import GitHubSession | from swh.lister.github.utils import GitHubSession | ||||
from swh.model.model import Origin | |||||
from . import USER_AGENT | from . import USER_AGENT | ||||
from .base import BaseMetadataFetcher, InvalidOrigin | from .base import BaseMetadataFetcher, InvalidOrigin | ||||
HTTP_ACCEPT = "application/vnd.github.v3+json" | HTTP_ACCEPT = "application/vnd.github.v3+json" | ||||
"""HTTP header sent on all API requests to GitHub.""" | """HTTP header sent on all API requests to GitHub.""" | ||||
# The format is defined by a well-understood MIME type; we might as well use that. | # The format is defined by a well-understood MIME type; we might as well use that. | ||||
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | def _get_origin_metadata_bytes(self) -> List[Tuple[str, bytes]]: | ||||
metadata_bytes = response.content | metadata_bytes = response.content | ||||
# TODO?: strip API hyperlinks from metadata_bytes to save space? | # TODO?: strip API hyperlinks from metadata_bytes to save space? | ||||
# They take 10KB for every repo, or 1KB when compressed by the database server. | # They take 10KB for every repo, or 1KB when compressed by the database server. | ||||
# This means processing metadata_bytes and changing the format, instead of | # This means processing metadata_bytes and changing the format, instead of | ||||
# archiving verbatim, though. | # archiving verbatim, though. | ||||
return [(METADATA_FORMAT, metadata_bytes)] | return [(METADATA_FORMAT, metadata_bytes)] | ||||
def get_parent_origin(self) -> Optional[Origin]:
    """Return the origin named by the ``parent`` entry of the GitHub metadata.

    Walks the objects yielded by ``self.get_origin_metadata()``, ignoring
    those whose format is not ``METADATA_FORMAT``. The first remaining
    document whose decoded JSON has a non-null ``"parent"`` value wins: its
    ``full_name`` is used as the path of an ``https://github.com/`` URL.

    Returns:
        An ``Origin`` built from that URL, or ``None`` when no metadata
        document carries a parent.

    Raises:
        KeyError: if a parent object has no ``"full_name"`` key.
    """
    for entry in self.get_origin_metadata():
        if entry.format == METADATA_FORMAT:
            document = json.loads(entry.metadata)
            upstream = document.get("parent")
            if upstream is not None:
                # NOTE(review, olasd): maybe build this from `clone_url`
                # (stripping the .git ending if it's there) instead.
                components = ("https", "github.com", upstream["full_name"], "", "")
                return Origin(url=urllib.parse.urlunsplit(components))
    return None
Maybe we should build that from `clone_url` (stripping the `.git` ending if it's there) instead. I guess it's consistent with the way the lister does it?