Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/metadata/github.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Metadata fetcher for GitHub.""" | """Metadata fetcher for GitHub.""" | ||||
import json | |||||
import re | import re | ||||
from typing import List, Optional, Tuple | from typing import List, Optional, Tuple | ||||
import urllib.parse | import urllib.parse | ||||
from swh.lister.github.utils import GitHubSession | from swh.lister.github.utils import GitHubSession | ||||
from swh.model.model import Origin | |||||
from . import USER_AGENT | from . import USER_AGENT | ||||
from .base import BaseMetadataFetcher, InvalidOrigin | from .base import BaseMetadataFetcher, InvalidOrigin | ||||
HTTP_ACCEPT = "application/vnd.github.v3+json" | HTTP_ACCEPT = "application/vnd.github.v3+json" | ||||
"""HTTP header sent on all API requests to GitHub.""" | """HTTP header sent on all API requests to GitHub.""" | ||||
# The format is defined by a well-understood MIME type; we might as well use that. | # The format is defined by a well-understood MIME type; we might as well use that. | ||||
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | def _get_origin_metadata_bytes(self) -> List[Tuple[str, bytes]]: | ||||
metadata_bytes = response.content | metadata_bytes = response.content | ||||
# TODO?: strip API hyperlinks from metadata_bytes to save space? | # TODO?: strip API hyperlinks from metadata_bytes to save space? | ||||
# They take 10KB for every repo, or 1KB when compressed by the database server. | # They take 10KB for every repo, or 1KB when compressed by the database server. | ||||
# This means processing metadata_bytes and changing the format, instead of | # This means processing metadata_bytes and changing the format, instead of | ||||
# archiving verbatim, though. | # archiving verbatim, though. | ||||
return [(METADATA_FORMAT, metadata_bytes)] | return [(METADATA_FORMAT, metadata_bytes)] | ||||
def get_parent_origins(self) -> List[Origin]:
    """Return the parent origins advertised by this origin's GitHub metadata.

    Each metadata object returned by ``self.get_origin_metadata()`` whose
    format is ``METADATA_FORMAT`` is decoded as JSON, and its ``"parent"``
    and ``"source"`` entries (when present and not null) are turned into
    :class:`Origin` objects built from their ``"html_url"`` field.

    Returns:
        A list of origins: the parent first, then the source if its URL
        differs from the parent's. The list is empty when the metadata
        mentions neither.
    """
    parents = []
    for metadata in self.get_origin_metadata():
        if metadata.format != METADATA_FORMAT:
            # Not a GitHub API document; nothing to extract from it.
            continue

        data = json.loads(metadata.metadata)
        parent = data.get("parent")
        source = data.get("source")

        # NOTE(review): building the URL from ``clone_url`` (minus a trailing
        # ".git") was suggested instead; ``html_url`` is presumably kept for
        # consistency with the way the lister builds origin URLs -- confirm.
        parent_url = None
        if parent is not None:
            parent_url = parent["html_url"]
            parents.append(Origin(url=parent_url))

        # Compare against parent_url rather than parent["html_url"], so that a
        # document with a source but no parent does not raise TypeError.
        if source is not None and source["html_url"] != parent_url:
            parents.append(Origin(url=source["html_url"]))
    return parents
Maybe we should build that from clone_url (stripping the .git ending if it's there) instead. I guess it's consistent with the way the lister does it?