diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py index 563fa18..c281f22 100644 --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -1,482 +1,488 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging from pathlib import Path import re import tarfile import tempfile from typing import Any, Dict, Iterator, List, Optional from urllib.parse import unquote, urljoin from bs4 import BeautifulSoup from swh.model.hashutil import hash_to_hex from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. ArchListerPage = List[Dict[str, Any]] def size_to_bytes(size: str) -> int: """Convert human readable file size to bytes. The resulting value is an approximation, as the input value is in most cases rounded. Args: size: A string representing a human readable file size (eg: '500K') Returns: A decimal representation of file size Examples:: >>> size_to_bytes("500") 500 >>> size_to_bytes("1K") 1000 """ units = { "K": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4, "P": 1000**5, "E": 1000**6, "Z": 1000**7, "Y": 1000**8, } if size.endswith(tuple(units)): v, u = (size[:-1], size[-1]) return int(v) * units[u] else: return int(size) class ArchLister(StatelessLister[ArchListerPage]): """List Arch Linux origins from the 'core', 'extra', and 'community' repositories. For 'official' Arch Linux it downloads core.tar.gz, extra.tar.gz and community.tar.gz from https://archive.archlinux.org/repos/last/, extracts them to a temp directory and then walks through each 'desc' file. Each 'desc' file describes the latest released version of a package and helps to build an origin url from which artifacts metadata are scraped. For 'arm' Arch Linux it follows the same discovery process, parsing 'desc' files. The main difference is that we can't get existing versions of an arm package because https://archlinuxarm.org does not have an 'archive' website or api. 
""" LISTER_NAME = "arch" VISIT_TYPE = "arch" INSTANCE = "arch" ARCH_PACKAGE_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}" ARCH_PACKAGE_VERSIONS_URL_PATTERN = "{base_url}/packages/{pkgname[0]}/{pkgname}" ARCH_PACKAGE_DOWNLOAD_URL_PATTERN = ( "{base_url}/packages/{pkgname[0]}/{pkgname}/{filename}" ) ARCH_API_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}/json" ARM_PACKAGE_URL_PATTERN = "{base_url}/packages/{arch}/{pkgname}" ARM_PACKAGE_DOWNLOAD_URL_PATTERN = "{base_url}/{arch}/{repo}/{filename}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, flavours: Dict[str, Any] = { "official": { "archs": ["x86_64"], "repos": ["core", "extra", "community"], "base_info_url": "https://archlinux.org", "base_archive_url": "https://archive.archlinux.org", "base_mirror_url": "", "base_api_url": "https://archlinux.org", }, "arm": { "archs": ["armv7h", "aarch64"], "repos": ["core", "extra", "community"], "base_info_url": "https://archlinuxarm.org", "base_archive_url": "", "base_mirror_url": "https://uk.mirror.archlinuxarm.org", "base_api_url": "", }, }, ): super().__init__( scheduler=scheduler, credentials=credentials, url=flavours["official"]["base_info_url"], instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.flavours = flavours def scrap_package_versions( self, name: str, repo: str, base_url: str ) -> List[Dict[str, Any]]: """Given a package 'name' and 'repo', make an http call to origin url and parse its content to get package versions artifacts data. That method is suitable only for 'official' Arch Linux, not 'arm'. 
Args: name: Package name repo: The repository the package belongs to (one of self.repos) Returns: A list of dicts, one per version Example:: [ {"url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950 "arch": "x86_64", "repo": "core", "name": "dialog", "version": "1:1.3_20190211-1", "length": 180000, "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", "last_modified": "2019-02-13T08:36:00"}, ] """ url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format( pkgname=name, base_url=base_url ) response = self.http_request(url) soup = BeautifulSoup(response.text, "html.parser") links = soup.find_all("a", href=True) # drop the first link (used to go up one directory) if links[0].attrs["href"] == "../": links.pop(0) versions = [] for link in links: # filename displayed can be cropped if name is too long, get it from href instead filename = unquote(link.attrs["href"]) if filename.endswith((".tar.xz", ".tar.zst")): # Extract arch from filename arch_rex = re.compile( rf"^{re.escape(name)}-(?P<version>.*)-(?P<arch>any|i686|x86_64)" rf"(.pkg.tar.(?:zst|xz))$" ) m = arch_rex.match(filename) if m is None: logger.error( "Can not find a match for architecture in %(filename)s", dict(filename=filename), ) else: arch = m.group("arch") version = m.group("version") # Extract last_modified and an approximate file size raw_text = link.next_sibling raw_text_rex = re.compile( r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+(?P<size>\w+)$" ) s = raw_text_rex.search(raw_text.strip()) if s is None: logger.error( "Can not find a match for 'last_modified' and/or " "'size' in '%(raw_text)s'", dict(raw_text=raw_text), ) else: assert s.groups() assert len(s.groups()) == 2 last_modified_str, size = s.groups() # format as expected last_modified = datetime.datetime.strptime( last_modified_str, "%d-%b-%Y %H:%M" ).isoformat() length = size_to_bytes(size) # we want bytes # link url is relative, format a canonical one url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format( base_url=base_url, pkgname=name, filename=filename ) versions.append( dict( name=name, version=version, repo=repo, arch=arch, filename=filename, url=url, last_modified=last_modified, length=length, ) ) return versions def get_repo_archive(self, url: str, destination_path: Path) -> Path: """Given a url and a destination path, retrieve and extract a .tar.gz archive which contains a 'desc' file for each package. Each .tar.gz archive corresponds to an Arch Linux repo ('core', 'extra', 'community'). Args: url: url of the .tar.gz archive to download destination_path: the path on disk where to extract the archive Returns: a directory Path where the archive has been extracted to. """ res = self.http_request(url) destination_path.parent.mkdir(parents=True, exist_ok=True) destination_path.write_bytes(res.content) extract_to = Path(str(destination_path).split(".tar.gz")[0]) tar = tarfile.open(destination_path) tar.extractall(path=extract_to) tar.close() return extract_to def parse_desc_file( self, path: Path, repo: str, base_url: str, dl_url_fmt: str, ) -> Dict[str, Any]: """Extract package information from a 'desc' file. 
There are subtle differences between parsing 'official' and 'arm' desc files. Args: path: A path to a 'desc' file on disk repo: The repo the package belongs to Returns: A dict of metadata Example:: {'api_url': 'https://archlinux.org/packages/core/x86_64/dialog/json', 'arch': 'x86_64', 'base': 'dialog', 'builddate': '1650081535', 'csize': '203028', 'desc': 'A tool to display dialog boxes from shell scripts', 'filename': 'dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst', 'isize': '483988', 'license': 'LGPL2.1', 'md5sum': '06407c0cb11c50d7bf83d600f2e8107c', 'name': 'dialog', 'packager': 'Evangelos Foutras ', 'pgpsig': 'pgpsig content xxx', 'project_url': 'https://invisible-island.net/dialog/', 'provides': 'libdialog.so=15-64', 'repo': 'core', 'sha256sum': 'ef8c8971f591de7db0f455970ef5d81d5aced1ddf139f963f16f6730b1851fa7', 'url': 'https://archive.archlinux.org/packages/.all/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst', # noqa: B950 'version': '1:1.3_20220414-1'} """ rex = re.compile(r"^\%(?P<k>\w+)\%\n(?P<v>.*)\n$", re.M) with path.open("rb") as content: parsed = rex.findall(content.read().decode()) data = {entry[0].lower(): entry[1] for entry in parsed} if "url" in data.keys(): data["project_url"] = data["url"] assert data["name"] assert data["filename"] assert data["arch"] data["repo"] = repo data["url"] = urljoin( base_url, dl_url_fmt.format( base_url=base_url, pkgname=data["name"], filename=data["filename"], arch=data["arch"], repo=repo, ), ) assert data["md5sum"] assert data["sha256sum"] data["checksums"] = { "md5sum": hash_to_hex(data["md5sum"]), "sha256sum": hash_to_hex(data["sha256sum"]), } return data def get_pages(self) -> Iterator[ArchListerPage]: """Yield an iterator of pages sorted by name in ascending order. Each page is a list of packages belonging to a flavour ('official', 'arm'), and a repo ('core', 'extra', 'community') """ for name, flavour in self.flavours.items(): for arch in flavour["archs"]: for repo in flavour["repos"]: yield self._get_repo_page(name, flavour, arch, repo) def _get_repo_page( self, name: str, flavour: Dict[str, Any], arch: str, repo: str ) -> ArchListerPage: with tempfile.TemporaryDirectory() as tmpdir: page = [] if name == "official": prefix = urljoin(flavour["base_archive_url"], "/repos/last/") filename = f"{repo}.files.tar.gz" archive_url = urljoin(prefix, f"{repo}/os/{arch}/{filename}") destination_path = Path(tmpdir, arch, filename) base_url = flavour["base_archive_url"] dl_url_fmt = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN base_info_url = flavour["base_info_url"] info_url_fmt = self.ARCH_PACKAGE_URL_PATTERN elif name == "arm": filename = f"{repo}.files.tar.gz" archive_url = urljoin( flavour["base_mirror_url"], f"{arch}/{repo}/{filename}" ) destination_path = Path(tmpdir, arch, filename) base_url = flavour["base_mirror_url"] dl_url_fmt = self.ARM_PACKAGE_DOWNLOAD_URL_PATTERN base_info_url = flavour["base_info_url"] info_url_fmt = self.ARM_PACKAGE_URL_PATTERN archive = self.get_repo_archive( url=archive_url, destination_path=destination_path ) assert archive packages_desc = list(archive.glob("**/desc")) logger.debug( "Processing %(instance)s source packages info from " "%(flavour)s %(arch)s %(repo)s repository, " "(%(qty)s packages).", dict( instance=self.instance, flavour=name, arch=arch, repo=repo, qty=len(packages_desc), ), ) for package_desc in packages_desc: data = self.parse_desc_file( path=package_desc, repo=repo, base_url=base_url, dl_url_fmt=dl_url_fmt, ) assert data["builddate"] last_modified = datetime.datetime.fromtimestamp( float(data["builddate"]), 
tz=datetime.timezone.utc ) assert data["name"] assert data["filename"] assert data["arch"] url = info_url_fmt.format( base_url=base_info_url, pkgname=data["name"], filename=data["filename"], repo=repo, arch=data["arch"], ) assert data["version"] if name == "official": # find all versions of a package by scraping the archive versions = self.scrap_package_versions( name=data["name"], repo=repo, base_url=base_url ) elif name == "arm": # There is no way to get related versions of a package, # but 'data' represents the latest released version, # use it in this case assert data["builddate"] assert data["csize"] assert data["url"] versions = [ dict( name=data["name"], version=data["version"], repo=repo, arch=data["arch"], filename=data["filename"], url=data["url"], last_modified=last_modified.replace(tzinfo=None).isoformat( timespec="seconds" ), length=int(data["csize"]), ) ] package = { "name": data["name"], "version": data["version"], "last_modified": last_modified, "url": url, "versions": versions, "data": data, } page.append(package) return page def get_origins_from_page(self, page: ArchListerPage) -> Iterator[ListedOrigin]: """Iterate on all arch pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for origin in page: artifacts = [] arch_metadata = [] for version in origin["versions"]: artifacts.append( { "version": version["version"], "filename": version["filename"], "url": version["url"], "length": version["length"], } ) if version["version"] == origin["version"]: artifacts[-1]["checksums"] = { "md5": origin["data"]["md5sum"], "sha256": origin["data"]["sha256sum"], } else: artifacts[-1]["checksums"] = {"length": version["length"]} arch_metadata.append( { "version": version["version"], "name": version["name"], "arch": version["arch"], "repo": version["repo"], "last_modified": version["last_modified"], } ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin["url"], last_update=origin["last_modified"], extra_loader_arguments={ "artifacts": artifacts, "arch_metadata": arch_metadata, }, ) diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py index 9bbdf37..dc43d7d 100644 --- a/swh/lister/aur/lister.py +++ b/swh/lister/aur/lister.py @@ -1,152 +1,158 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging from typing import Any, Dict, Iterator, List, Optional from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. AurListerPage = Dict[str, Any] class AurLister(StatelessLister[AurListerPage]): """List Arch User Repository (AUR) origins. Given a url (used as a base url, default is 'https://aur.archlinux.org'), download 'packages-meta-v1.json.gz', a json file listing all existing package definitions. Each entry describes the latest released version of a package. The origin url for a package is built using 'pkgname' and corresponds to a git repository. An RPC API exists but it is not used, in order to save bandwidth. See https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html for more on this. 
""" LISTER_NAME = "aur" VISIT_TYPE = "aur" INSTANCE = "aur" BASE_URL = "https://aur.archlinux.org" DEFAULT_PACKAGES_INDEX_URL = "{base_url}/packages-meta-v1.json.gz" PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git" PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz" ORIGIN_URL_PATTERN = "{base_url}/packages/{pkgname}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def download_packages_index(self) -> List[Dict[str, Any]]: """Build an url based on self.DEFAULT_PACKAGES_INDEX_URL format string, and download the archive to self.DESTINATION_PATH Returns: a directory Path where the archive has been downloaded to. """ url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url) return self.http_request(url).json() def get_pages(self) -> Iterator[AurListerPage]: """Yield an iterator which returns 'page' Each page corresponds to a package with a 'version', an 'url' for a Git repository, a 'project_url' which represents the upstream project url and a canonical 'snapshot_url' from which a tar.gz archive of the package can be downloaded. """ packages = self.download_packages_index() logger.debug("Found %s AUR packages in aur_index", len(packages)) for package in packages: # Exclude lines where Name differs from PackageBase as they represents # split package and they don't have resolvable snapshots url if package["Name"] == package["PackageBase"]: logger.debug("Processing AUR package %s", package["Name"]) pkgname = package["PackageBase"] version = package["Version"] project_url = package["URL"] last_modified = datetime.datetime.fromtimestamp( float(package["LastModified"]), tz=datetime.timezone.utc ).isoformat() yield { "pkgname": pkgname, "version": version, "url": self.ORIGIN_URL_PATTERN.format( base_url=self.BASE_URL, pkgname=pkgname ), "git_url": self.PACKAGE_VCS_URL_PATTERN.format( base_url=self.BASE_URL, pkgname=pkgname ), "snapshot_url": self.PACKAGE_SNAPSHOT_URL_PATTERN.format( base_url=self.BASE_URL, pkgname=pkgname ), "project_url": project_url, "last_modified": last_modified, } def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances. It uses the vcs (Git) url as an origin and adds `artifacts` and `aur_metadata` entries to 'extra_loader_arguments'. `artifacts` describe the file to download and `aur_metadata` store some metadata that can be useful for the loader. 
""" assert self.lister_obj.id is not None last_update = datetime.datetime.fromisoformat(origin["last_modified"]) filename = origin["snapshot_url"].split("/")[-1] artifacts = [ { "filename": filename, "url": origin["snapshot_url"], "version": origin["version"], } ] aur_metadata = [ { "version": origin["version"], "project_url": origin["project_url"], "last_update": origin["last_modified"], "pkgname": origin["pkgname"], } ] yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin["url"], last_update=last_update, extra_loader_arguments={ "artifacts": artifacts, "aur_metadata": aur_metadata, }, ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type="git", url=origin["git_url"], last_update=last_update, ) diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index 7bcec03..05720c9 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -1,173 +1,179 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime import logging import random from typing import Any, Dict, Iterator, List, Optional from urllib import parse import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @dataclass class BitbucketListerState: """State of Bitbucket lister""" last_repo_cdate: Optional[datetime] = None """Creation date and time of the last listed repository during an incremental pass""" class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): """List origins from Bitbucket using its API. Bitbucket API has the following rate-limit configuration: * 60 requests per hour for anonymous users * 1000 requests per hour for authenticated users The lister is working in anonymous mode by default but Bitbucket account credentials can be provided to perform authenticated requests. 
""" LISTER_NAME = "bitbucket" INSTANCE = "bitbucket" API_URL = "https://api.bitbucket.org/2.0/repositories" def __init__( self, scheduler: SchedulerInterface, page_size: int = 1000, incremental: bool = True, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.url_params: Dict[str, Any] = { "pagelen": page_size, # only return needed JSON fields in bitbucket API responses # (also prevent errors 500 when listing) "fields": ( "next,values.links.clone.href,values.scm,values.updated_on," "values.created_on" ), } self.session.headers.update({"Accept": "application/json"}) if len(self.credentials) > 0: cred = random.choice(self.credentials) logger.warning("Using Bitbucket credentials from user %s", cred["username"]) self.set_credentials(cred["username"], cred["password"]) else: logger.warning("No credentials set in configuration, using anonymous mode") def state_from_dict(self, d: Dict[str, Any]) -> BitbucketListerState: last_repo_cdate = d.get("last_repo_cdate") if last_repo_cdate is not None: d["last_repo_cdate"] = iso8601.parse_date(last_repo_cdate) return BitbucketListerState(**d) def state_to_dict(self, state: BitbucketListerState) -> Dict[str, Any]: d = asdict(state) last_repo_cdate = d.get("last_repo_cdate") if last_repo_cdate is not None: d["last_repo_cdate"] = last_repo_cdate.isoformat() return d def set_credentials(self, username: Optional[str], password: Optional[str]) -> None: """Set basic authentication headers with given credentials.""" if username is not None and password is not None: self.session.auth = (username, password) def get_pages(self) -> Iterator[List[Dict[str, Any]]]: last_repo_cdate: str = "1970-01-01" if ( self.incremental and self.state is not None and self.state.last_repo_cdate is not None ): last_repo_cdate = self.state.last_repo_cdate.isoformat() while True: self.url_params["after"] = last_repo_cdate body = self.http_request(self.url, params=self.url_params).json() yield body["values"] next_page_url = body.get("next") if next_page_url is not None: next_page_url = parse.urlparse(next_page_url) if not next_page_url.query: logger.warning("Failed to parse url %s", next_page_url) break last_repo_cdate = parse.parse_qs(next_page_url.query)["after"][0] else: # last page break def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of Bitbucket repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None for repo in page: last_update = iso8601.parse_date(repo["updated_on"]) origin_url = repo["links"]["clone"][0]["href"] origin_type = repo["scm"] yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=origin_type, last_update=last_update, ) def commit_page(self, page: List[Dict[str, Any]]) -> None: """Update the currently stored state using the latest listed page.""" if self.incremental: last_repo = page[-1] last_repo_cdate = iso8601.parse_date(last_repo["created_on"]) if ( self.state.last_repo_cdate is None or last_repo_cdate > self.state.last_repo_cdate ): self.state.last_repo_cdate = last_repo_cdate def finalize(self) -> None: if self.incremental: scheduler_state = self.get_state_from_scheduler() if self.state.last_repo_cdate is None: 
return # Update the lister state in the backend only if the last seen id of # the current run is higher than that stored in the database. if ( scheduler_state.last_repo_cdate is None or self.state.last_repo_cdate > scheduler_state.last_repo_cdate ): self.updated = True diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py index 5b488e4..cc440dc 100644 --- a/swh/lister/bower/lister.py +++ b/swh/lister/bower/lister.py @@ -1,64 +1,70 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Dict, Iterator, List, Optional from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. BowerListerPage = List[Dict[str, str]] class BowerLister(StatelessLister[BowerListerPage]): """List Bower (Javascript package manager) origins.""" LISTER_NAME = "bower" VISIT_TYPE = "git" # Bower origin urls are Git repositories INSTANCE = "bower" API_URL = "https://registry.bower.io/packages" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) def get_pages(self) -> Iterator[BowerListerPage]: """Yield an iterator which returns 'page'. It uses the api endpoint provided by `https://registry.bower.io/packages` to get a list of package names with an origin url that corresponds to a Git repository. There is only one page that lists all origin urls. """ response = self.http_request(self.url) yield response.json() def get_origins_from_page(self, page: BowerListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for entry in page: yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=entry["url"], last_update=None, ) diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 49458d0..4a9aeab 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -1,225 +1,231 @@ # Copyright (C) 2019-2022 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from requests.exceptions import HTTPError from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) Repositories = List[Dict[str, Any]] class CGitLister(StatelessLister[Repositories]): """Lister class for CGit repositories. This lister will retrieve the list of published git repositories by parsing the HTML page(s) of the index retrieved at `url`. 
The lister currently defines 2 listing behaviors: - If the `base_git_url` is provided, the listed origin urls are computed from the base git url and the repository link listed in the main listing page (resulting in fewer HTTP queries than the 2nd behavior below). This is expected to be the main deployed behavior. - Otherwise (with no `base_git_url`), for each found git repository listed, one extra HTTP query is made at the given url found in the main listing page to gather published "Clone" URLs to be used as origin URL for that git repo. If several "Clone" urls are provided, prefer the http/https one, if any, otherwise fall back to the first one. """ LISTER_NAME = "cgit" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, base_git_url: Optional[str] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): """Lister class for CGit repositories. Args: url: main URL of the CGit instance, i.e. url of the index of published git repositories on this instance. instance: Name of cgit instance. Defaults to url's network location if unset. base_git_url: Optional base git url which allows the origin url computations. """ super().__init__( scheduler=scheduler, url=url, instance=instance, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/html"}) self.base_git_url = base_git_url def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" response = self.http_request(url) return BeautifulSoup(response.text, features="html.parser") def get_pages(self) -> Iterator[Repositories]: """Generate git 'project' URLs found on the current CGit server. The last_update date is retrieved from the repository list page, to avoid computing it from the repository details page, which only gives a date per branch """ next_page: Optional[str] = self.url while next_page: bs_idx = self._get_and_parse(next_page) page_results = [] for tr in bs_idx.find("div", {"class": "content"}).find_all( "tr", {"class": ""} ): repository_link = tr.find("a")["href"] repo_url = None git_url = None base_url = urljoin(self.url, repository_link).strip("/") if self.base_git_url: # mapping provided # computing git url git_url = base_url.replace(self.url, self.base_git_url) else: # we compute the git detailed page url from which we will retrieve # the git url (cf. 
self.get_origins_from_page) repo_url = base_url span = tr.find("span", {"class": re.compile("age-")}) last_updated_date = span.get("title") if span else None page_results.append( { "url": repo_url, "git_url": git_url, "last_updated_date": last_updated_date, } ) yield page_results try: pager = bs_idx.find("ul", {"class": "pager"}) current_page = pager.find("a", {"class": "current"}) if current_page: next_page = current_page.parent.next_sibling.a["href"] next_page = urljoin(self.url, next_page) except (AttributeError, KeyError): # no pager, or no next page next_page = None def get_origins_from_page( self, repositories: Repositories ) -> Iterator[ListedOrigin]: """Convert a page of cgit repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None for repo in repositories: origin_url = repo["git_url"] or self._get_origin_from_repository_url( repo["url"] ) if origin_url is None: continue yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="git", last_update=_parse_last_updated_date(repo), ) def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: """Extract the git url from the repository page""" try: bs = self._get_and_parse(repository_url) except HTTPError as e: logger.warning( "Unexpected HTTP status code %s on %s", e.response.status_code, e.response.url, ) return None # check if we are on the summary tab, if not, go to this tab tab = bs.find("table", {"class": "tabs"}) if tab: summary_a = tab.find("a", string="summary") if summary_a: summary_url = urljoin(repository_url, summary_a["href"]).strip("/") if summary_url != repository_url: logger.debug( "%s : Active tab is not the summary, trying to load the summary page", repository_url, ) return self._get_origin_from_repository_url(summary_url) else: logger.debug("No summary tab found on %s", repository_url) # origin urls are listed on the repository page # TODO check if forcing https is better or not ? 
# # # urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] if not urls: logger.debug("No git urls found on %s", repository_url) return None # look for the http/https url, if any, and use it as origin_url for url in urls: if urlparse(url).scheme in ("http", "https"): origin_url = url break else: # otherwise, choose the first one origin_url = urls[0] return origin_url def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]: """Parse the last updated date""" date = repository.get("last_updated_date") if not date: return None parsed_date = None for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"): try: parsed_date = datetime.strptime(date, date_format) # force UTC to avoid naive datetime if not parsed_date.tzinfo: parsed_date = parsed_date.replace(tzinfo=timezone.utc) break except Exception: pass if not parsed_date: logger.warning( "Could not parse %s last_updated date: %s", repository["url"], date, ) return parsed_date diff --git a/swh/lister/conda/lister.py b/swh/lister/conda/lister.py index ab0190f..4f5cb40 100644 --- a/swh/lister/conda/lister.py +++ b/swh/lister/conda/lister.py @@ -1,123 +1,129 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import bz2 from collections import defaultdict import datetime import json import logging from typing import Any, Dict, Iterator, List, Optional, Tuple import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. 
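# A page is an (arch, packages) tuple, where packages maps each archive filename to its metadata dict decoded from repodata.json.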
CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]] class CondaLister(StatelessLister[CondaListerPage]): """List Conda (anaconda.com) origins.""" LISTER_NAME = "conda" VISIT_TYPE = "conda" INSTANCE = "conda" BASE_REPO_URL = "https://repo.anaconda.com/pkgs" REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2" ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}" ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, url: str = BASE_REPO_URL, channel: str = "", archs: List = [], + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=url, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.channel: str = channel self.archs: List[str] = archs self.packages: Dict[str, Any] = defaultdict(dict) self.package_dates: Dict[str, Any] = defaultdict(list) def get_pages(self) -> Iterator[CondaListerPage]: """Yield an iterator which returns 'page'""" for arch in self.archs: repodata_url = self.REPO_URL_PATTERN.format( url=self.url, channel=self.channel, arch=arch ) response = self.http_request(url=repodata_url) packages: Dict[str, Any] = json.loads(bz2.decompress(response.content))[ "packages" ] yield (arch, packages) def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None arch, packages = page package_names = set() for filename, package_metadata in packages.items(): package_names.add(package_metadata["name"]) version_key = ( f"{arch}/{package_metadata['version']}-{package_metadata['build']}" ) artifact: Dict[str, Any] = { "filename": filename, "url": self.ARCHIVE_URL_PATTERN.format( url=self.url, channel=self.channel, filename=filename, arch=arch, ), "version": version_key, "checksums": {}, } for checksum in ("md5", "sha256"): if checksum in package_metadata: artifact["checksums"][checksum] = package_metadata[checksum] self.packages[package_metadata["name"]][version_key] = artifact package_date = None if "timestamp" in package_metadata: package_date = datetime.datetime.fromtimestamp( package_metadata["timestamp"] / 1e3, datetime.timezone.utc ) elif "date" in package_metadata: package_date = iso8601.parse_date(package_metadata["date"]) if package_date: artifact["date"] = package_date.isoformat() self.package_dates[package_metadata["name"]].append(package_date) for package_name in package_names: package_dates = self.package_dates[package_name] yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=self.ORIGIN_URL_PATTERN.format( channel=self.channel, pkgname=package_name ), last_update=max(package_dates, default=None), extra_loader_arguments={ "artifacts": list(self.packages[package_name].values()) }, ) diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py index 32f7479..80669eb 100644 --- a/swh/lister/cpan/lister.py +++ b/swh/lister/cpan/lister.py @@ -1,199 +1,205 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from datetime import datetime import logging from typing import Any, Dict, 
Iterator, List, Optional, Set, Union import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. CpanListerPage = Set[str] def get_field_value(entry, field_name): """ Splits ``field_name`` on ``.``, and use it as path in the nested ``entry`` dictionary. If a value does not exist, returns None. >>> entry = {"_source": {"foo": 1, "bar": {"baz": 2, "qux": [3]}}} >>> get_field_value(entry, "foo") 1 >>> get_field_value(entry, "bar") {'baz': 2, 'qux': [3]} >>> get_field_value(entry, "bar.baz") 2 >>> get_field_value(entry, "bar.qux") 3 """ fields = field_name.split(".") field_value = entry["_source"] for field in fields[:-1]: field_value = field_value.get(field, {}) field_value = field_value.get(fields[-1]) # scrolled results might have field value in a list if isinstance(field_value, list): field_value = field_value[0] return field_value def get_module_version( module_name: str, module_version: Union[str, float, int], release_name: str ) -> str: # some old versions fail to be parsed and cpan api set version to 0 if module_version == 0: prefix = f"{module_name}-" if release_name.startswith(prefix): # extract version from release name module_version = release_name.replace(prefix, "", 1) return str(module_version) class CpanLister(StatelessLister[CpanListerPage]): """The Cpan lister list origins from 'Cpan', the Comprehensive Perl Archive Network.""" LISTER_NAME = "cpan" VISIT_TYPE = "cpan" INSTANCE = "cpan" API_BASE_URL = "https://fastapi.metacpan.org/v1" REQUIRED_DOC_FIELDS = [ "download_url", "checksum_sha256", "distribution", "version", ] OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"] ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list) self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list) self.release_dates: Dict[str, List[datetime]] = defaultdict(list) self.module_names: Set[str] = set() def process_release_page(self, page: List[Dict[str, Any]]): for entry in page: if "_source" not in entry or not all( k in entry["_source"].keys() for k in self.REQUIRED_DOC_FIELDS ): logger.warning( "Skipping release entry %s as some required fields are missing", entry.get("_source"), ) continue module_name = get_field_value(entry, "distribution") module_version = get_field_value(entry, "version") module_download_url = get_field_value(entry, "download_url") module_sha256_checksum = get_field_value(entry, "checksum_sha256") module_date = get_field_value(entry, "date") module_size = get_field_value(entry, "stat.size") module_author = get_field_value(entry, "author") module_author_fullname = get_field_value(entry, "metadata.author") release_name = get_field_value(entry, "name") module_version = get_module_version( module_name, module_version, release_name ) self.artifacts[module_name].append( { "url": module_download_url, "filename": 
module_download_url.split("/")[-1], "checksums": {"sha256": module_sha256_checksum}, "version": module_version, "length": module_size, } ) self.module_metadata[module_name].append( { "name": module_name, "version": module_version, "cpan_author": module_author, "author": ( module_author_fullname if module_author_fullname not in (None, "", "unknown") else module_author ), "date": module_date, "release_name": release_name, } ) self.release_dates[module_name].append(iso8601.parse_date(module_date)) self.module_names.add(module_name) def get_pages(self) -> Iterator[CpanListerPage]: """Yield an iterator which returns 'page'""" endpoint = f"{self.API_BASE_URL}/release/_search" scrollendpoint = f"{self.API_BASE_URL}/_search/scroll" size = 1000 res = self.http_request( endpoint, params={ "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS, "size": size, "scroll": "1m", }, ) data = res.json()["hits"]["hits"] self.process_release_page(data) _scroll_id = res.json()["_scroll_id"] while data: scroll_res = self.http_request( scrollendpoint, params={"scroll": "1m", "scroll_id": _scroll_id} ) data = scroll_res.json()["hits"]["hits"] _scroll_id = scroll_res.json()["_scroll_id"] self.process_release_page(data) yield self.module_names def get_origins_from_page( self, module_names: CpanListerPage ) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for module_name in module_names: yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=self.ORIGIN_URL_PATTERN.format(module_name=module_name), last_update=max(self.release_dates[module_name]), extra_loader_arguments={ "api_base_url": self.API_BASE_URL, "artifacts": self.artifacts[module_name], "module_metadata": self.module_metadata[module_name], }, ) diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index 35e3d2b..728c6d3 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,150 +1,159 @@ # Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import json import logging import subprocess from typing import Dict, Iterator, List, Optional, Tuple import pkg_resources from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) CRAN_MIRROR = "https://cran.r-project.org" PageType = List[Dict[str, str]] class CRANLister(StatelessLister[PageType]): """ List all packages hosted on The Comprehensive R Archive Network. """ LISTER_NAME = "CRAN" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( - scheduler, url=CRAN_MIRROR, instance="cran", credentials=credentials + scheduler, + url=CRAN_MIRROR, + instance="cran", + credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def get_pages(self) -> Iterator[PageType]: """ Yields a single page containing all CRAN packages info. 
""" yield read_cran_data() def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None seen_urls = set() for package_info in page: origin_url, artifact_url = compute_origin_urls(package_info) if origin_url in seen_urls: # prevent multiple listing of an origin, # most recent version will be listed first continue seen_urls.add(origin_url) yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="cran", last_update=parse_packaged_date(package_info), extra_loader_arguments={ "artifacts": [ { "url": artifact_url, "version": package_info["Version"], "package": package_info["Package"], "checksums": {"md5": package_info["MD5sum"]}, } ] }, ) def read_cran_data() -> List[Dict[str, str]]: """ Runs R script which uses inbuilt API to return a json response containing data about the R packages. Returns: List of Dict about R packages. For example:: [ { 'Package': 'A3', 'Version': '1.0.0' }, { 'Package': 'abbyyR', 'Version': '0.5.4' }, ... ] """ filepath = pkg_resources.resource_filename("swh.lister.cran", "list_all_packages.R") logger.debug("Executing R script %s", filepath) response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False) return json.loads(response.stdout.decode("utf-8")) def compute_origin_urls(package_info: Dict[str, str]) -> Tuple[str, str]: """Compute the package url from the repo dict. Args: repo: dict with key 'Package', 'Version' Returns: the tuple project url, artifact url """ package = package_info["Package"] version = package_info["Version"] origin_url = f"{CRAN_MIRROR}/package={package}" artifact_url = f"{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz" return origin_url, artifact_url def parse_packaged_date(package_info: Dict[str, str]) -> Optional[datetime]: packaged_at_str = package_info.get("Packaged", "") packaged_at = None if packaged_at_str: packaged_at_str = packaged_at_str.replace(" UTC", "") # Packaged field possible formats: # - "%Y-%m-%d %H:%M:%S[.%f] UTC; ", # - "%a %b %d %H:%M:%S %Y; " for date_format in ( "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f", "%a %b %d %H:%M:%S %Y", ): try: packaged_at = datetime.strptime( packaged_at_str.split(";")[0], date_format, ).replace(tzinfo=timezone.utc) break except Exception: continue if packaged_at is None: logger.debug( "Could not parse %s package release date: %s", package_info["Package"], packaged_at_str, ) return packaged_at diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index eca9f10..6b8c94a 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -1,250 +1,256 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import csv from dataclasses import dataclass from datetime import datetime import json import logging from pathlib import Path import tarfile import tempfile from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urlparse import iso8601 from packaging.version import parse as parse_version from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. 
CratesListerPage = List[Dict[str, Any]] @dataclass class CratesListerState: """Store lister state for incremental mode operations. 'index_last_update' represents the UTC time the crates.io database dump was started """ index_last_update: Optional[datetime] = None class CratesLister(Lister[CratesListerState, CratesListerPage]): """List origins from the "crates.io" forge. It downloads a tar.gz archive, automatically generated every 24 hours, which contains the crates.io database table content as csv files. By parsing two csv files we can list all Crates.io package names and their related versions. In incremental mode, it checks each entry by comparing its 'last_update' value with self.state.index_last_update """ LISTER_NAME = "crates" VISIT_TYPE = "crates" INSTANCE = "crates" BASE_URL = "https://crates.io" DB_DUMP_URL = "https://static.crates.io/db-dump.tar.gz" CRATE_FILE_URL_PATTERN = ( "https://static.crates.io/crates/{crate}/{crate}-{version}.crate" ) CRATE_URL_PATTERN = "https://crates.io/crates/{crate}" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.BASE_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.index_metadata: Dict[str, str] = {} def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: index_last_update = d.get("index_last_update") if index_last_update is not None: d["index_last_update"] = iso8601.parse_date(index_last_update) return CratesListerState(**d) def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {"index_last_update": None} index_last_update = state.index_last_update if index_last_update is not None: d["index_last_update"] = index_last_update.isoformat() return d def is_new(self, dt_str: str): """Returns True when dt_str is greater than self.state.index_last_update """ dt = iso8601.parse_date(dt_str) last = self.state.index_last_update return not last or (last is not None and last < dt) def get_and_parse_db_dump(self) -> Dict[str, Any]: """Download and parse csv files from db_dump_path. Returns a dict where each entry corresponds to a package name with its related versions. 
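Example (values are illustrative):: {"rand": {"name": "rand", "updated_at": "2022-08-08 15:01:25", "versions": {"0.8.5": {...}}}}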
""" with tempfile.TemporaryDirectory() as tmpdir: file_name = self.DB_DUMP_URL.split("/")[-1] archive_path = Path(tmpdir) / file_name # Download the Db dump with self.http_request(self.DB_DUMP_URL, stream=True) as res: with open(archive_path, "wb") as out_file: for chunk in res.iter_content(chunk_size=1024): out_file.write(chunk) # Extract the Db dump db_dump_path = Path(str(archive_path).split(".tar.gz")[0]) tar = tarfile.open(archive_path) tar.extractall(path=db_dump_path) tar.close() csv.field_size_limit(1000000) (crates_csv_path,) = list(db_dump_path.glob("*/data/crates.csv")) (versions_csv_path,) = list(db_dump_path.glob("*/data/versions.csv")) (index_metadata_json_path,) = list(db_dump_path.rglob("*metadata.json")) with index_metadata_json_path.open("rb") as index_metadata_json: self.index_metadata = json.load(index_metadata_json) crates: Dict[str, Any] = {} with crates_csv_path.open() as crates_fd: crates_csv = csv.DictReader(crates_fd) for item in crates_csv: if self.is_new(item["updated_at"]): # crate 'id' as key crates[item["id"]] = { "name": item["name"], "updated_at": item["updated_at"], "versions": {}, } data: Dict[str, Any] = {} with versions_csv_path.open() as versions_fd: versions_csv = csv.DictReader(versions_fd) for version in versions_csv: if version["crate_id"] in crates.keys(): crate: Dict[str, Any] = crates[version["crate_id"]] crate["versions"][version["num"]] = version # crate 'name' as key data[crate["name"]] = crate return data def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: """Transform package version definition dict to a suitable page entry dict """ crate_file = self.CRATE_FILE_URL_PATTERN.format( crate=entry["name"], version=entry["version"] ) filename = urlparse(crate_file).path.split("/")[-1] return dict( name=entry["name"], version=entry["version"], checksum=entry["checksum"], yanked=True if entry["yanked"] == "t" else False, crate_file=crate_file, filename=filename, last_update=entry["updated_at"], ) def get_pages(self) -> Iterator[CratesListerPage]: """Each page is a list of crate versions with: - name: Name of the crate - version: Version - checksum: Checksum - yanked: Whether the package is yanked or not - crate_file: Url of the crate file - filename: File name of the crate file - last_update: Last update for that version """ # Fetch crates.io Db dump, then Parse the data. 
dataset = self.get_and_parse_db_dump() logger.debug("Found %s crates in crates_index", len(dataset)) # Each entry from dataset will correspond to a page for name, item in dataset.items(): page = [] # sort crate versions versions: list = sorted(item["versions"].keys(), key=parse_version) for version in versions: v = item["versions"][version] v["name"] = name v["version"] = version page.append(self.page_entry_dict(v)) yield page def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: """Iterate on all crate pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None url = self.CRATE_URL_PATTERN.format(crate=page[0]["name"]) last_update = page[0]["last_update"] artifacts = [] crates_metadata = [] for entry in page: # Build an artifact entry following original-artifacts-json specification # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 artifacts.append( { "version": entry["version"], "filename": entry["filename"], "url": entry["crate_file"], "checksums": { "sha256": entry["checksum"], }, } ) crates_metadata.append( { "version": entry["version"], "yanked": entry["yanked"], "last_update": entry["last_update"], } ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=iso8601.parse_date(last_update), extra_loader_arguments={ "artifacts": artifacts, "crates_metadata": crates_metadata, }, ) def finalize(self) -> None: last: datetime = iso8601.parse_date(self.index_metadata["timestamp"]) if not self.state.index_last_update: self.state.index_last_update = last self.updated = True diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py index 940e453..23d520a 100644 --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -1,276 +1,282 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import bz2 from collections import defaultdict from dataclasses import dataclass, field from email.utils import parsedate_to_datetime import gzip from itertools import product import logging import lzma import os from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple from urllib.parse import urljoin from debian.deb822 import Sources from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) decompressors: Dict[str, Callable[[Any], Any]] = { "gz": lambda f: gzip.GzipFile(fileobj=f), "bz2": bz2.BZ2File, "xz": lzma.LZMAFile, } Suite = str Component = str PkgName = str PkgVersion = str DebianOrigin = str DebianPageType = Iterator[Sources] @dataclass class DebianListerState: """State of debian lister""" package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) """Dictionary mapping a package name to all the versions found during last listing""" class DebianLister(Lister[DebianListerState, DebianPageType]): """ List source packages for a given debian or derivative distribution. The lister will create a snapshot for each package name from all its available versions. 
If a package snapshot is different from the last listing operation, it will be sent to the scheduler, which will create a loading task to archive newly found source code. Args: scheduler: instance of SchedulerInterface distribution: identifier of listed distribution (e.g. Debian, Ubuntu) mirror_url: debian package archives mirror URL suites: list of distribution suites to process components: list of package components to process """ LISTER_NAME = "debian" def __init__( self, scheduler: SchedulerInterface, distribution: str = "Debian", mirror_url: str = "http://deb.debian.org/debian/", suites: List[Suite] = ["stretch", "buster", "bullseye"], components: List[Component] = ["main", "contrib", "non-free"], credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=mirror_url, instance=distribution, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # to ensure urljoin will produce valid Sources URL if not self.url.endswith("/"): self.url += "/" self.distribution = distribution self.suites = suites self.components = components # will hold all listed origins info self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {} # will contain the lister state after a call to run self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} def state_from_dict(self, d: Dict[str, Any]) -> DebianListerState: return DebianListerState(package_versions={k: set(v) for k, v in d.items()}) def state_to_dict(self, state: DebianListerState) -> Dict[str, Any]: return {k: list(v) for k, v in state.package_versions.items()} def debian_index_urls( self, suite: Suite, component: Component ) -> Iterator[Tuple[str, str]]: """Return an iterator on possible Sources file URLs as multiple compression formats can be used.""" compression_exts = ("xz", "bz2", "gz") base_urls = [ urljoin(self.url, f"dists/{suite}/{component}/source/Sources"), urljoin(self.url, f"dists/{suite}/updates/{component}/source/Sources"), ] for base_url, ext in product(base_urls, compression_exts): yield (f"{base_url}.{ext}", ext) yield (base_url, "") def page_request(self, suite: Suite, component: Component) -> DebianPageType: """Return parsed package Sources file for a given debian suite and component.""" for url, compression in self.debian_index_urls(suite, component): try: response = self.http_request(url, stream=True) except HTTPError: pass else: last_modified = response.headers.get("Last-Modified") self.last_sources_update = ( parsedate_to_datetime(last_modified) if last_modified else None ) decompressor = decompressors.get(compression) if decompressor: data = decompressor(response.raw).readlines() else: data = response.raw.readlines() break else: data = "" logger.debug("Could not retrieve sources index for %s/%s", suite, component) return Sources.iter_paragraphs(data) def get_pages(self) -> Iterator[DebianPageType]: """Return an iterator on parsed debian package Sources files, one per combination of debian suite and component.""" for suite, component in product(self.suites, self.components): logger.debug( "Processing %s %s source packages info for %s component.", self.instance, suite, component, ) self.current_suite = suite self.current_component = component yield self.page_request(suite, component) def origin_url_for_package(self, package_name: PkgName) -> DebianOrigin: """Return the origin url for the given package""" return 
f"deb://{self.instance}/packages/{package_name}" def get_origins_from_page(self, page: DebianPageType) -> Iterator[ListedOrigin]: """Convert a page of debian package sources into an iterator of ListedOrigin. Please note that the returned origins correspond to packages only listed for the first time in order to get an accurate origins counter in the statistics returned by the run method of the lister. Packages already listed in another page but with different versions will be put in cache by the method and updated ListedOrigin objects will be sent to the scheduler later in the commit_page method. Indeed as multiple debian suites can be processed, a similar set of package names can be listed for two different package source pages, only their version will differ, resulting in origins counted multiple times in lister statistics. """ assert self.lister_obj.id is not None origins_to_send = {} # iterate on each package source info for src_pkg in page: # gather package files info that will be used by the debian loader files: Dict[str, Dict[str, Any]] = defaultdict(dict) for field_ in src_pkg._multivalued_fields: if field_.startswith("checksums-"): sum_name = field_[len("checksums-") :] else: sum_name = "md5sum" if field_ in src_pkg: for entry in src_pkg[field_]: name = entry["name"] files[name]["name"] = name files[name]["size"] = int(entry["size"], 10) files[name][sum_name] = entry[sum_name] files[name]["uri"] = os.path.join( self.url, src_pkg["Directory"], name ) # extract package name and version package_name = src_pkg["Package"] package_version = src_pkg["Version"] # build origin url origin_url = self.origin_url_for_package(package_name) # create package version key as expected by the debian loader package_version_key = ( f"{self.current_suite}/{self.current_component}/{package_version}" ) # this is the first time a package is listed if origin_url not in self.listed_origins: # create a ListedOrigin object for it that can be later # updated with new package versions info self.listed_origins[origin_url] = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="deb", extra_loader_arguments={"packages": {}}, last_update=self.last_sources_update, ) # init set that will contain all listed package versions self.package_versions[package_name] = set() # origin will be yielded at the end of that method origins_to_send[origin_url] = self.listed_origins[origin_url] # update package versions data in parameter that will be provided # to the debian loader self.listed_origins[origin_url].extra_loader_arguments["packages"].update( { package_version_key: { "name": package_name, "version": package_version, "files": files, } } ) if self.listed_origins[origin_url].last_update is None or ( self.last_sources_update is not None and self.last_sources_update # type: ignore > self.listed_origins[origin_url].last_update ): # update debian package last update if current processed sources index # has a greater modification date self.listed_origins[origin_url].last_update = self.last_sources_update # add package version key to the set of found versions self.package_versions[package_name].add(package_version_key) # package has already been listed during a previous listing process if package_name in self.state.package_versions: new_versions = ( self.package_versions[package_name] - self.state.package_versions[package_name] ) # no new versions so far, no need to send the origin to the scheduler if not new_versions: origins_to_send.pop(origin_url, None) logger.debug("Found %s new packages.", 
len(origins_to_send)) logger.debug( "Current total number of listed packages is equal to %s.", len(self.listed_origins), ) yield from origins_to_send.values() def finalize(self): # set mapping between listed package names and versions as lister state self.state.package_versions = self.package_versions self.updated = len(self.listed_origins) > 0 diff --git a/swh/lister/fedora/lister.py b/swh/lister/fedora/lister.py index 8f3dced..34712b3 100644 --- a/swh/lister/fedora/lister.py +++ b/swh/lister/fedora/lister.py @@ -1,259 +1,265 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass, field from datetime import datetime, timezone import logging -from typing import Any, Dict, Iterator, List, Set, Type +from typing import Any, Dict, Iterator, List, Optional, Set, Type from urllib.error import HTTPError from urllib.parse import urljoin import repomd from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import Lister logger = logging.getLogger(__name__) Release = int Edition = str PkgName = str PkgVersion = str FedoraOrigin = str FedoraPageType = Type[repomd.Repo] """Each page is a list of packages from a given Fedora (release, edition) pair""" def get_editions(release: Release) -> List[Edition]: """Get list of editions for a given release.""" # Ignore dirs that don't contain .rpm files: # Docker,CloudImages,Atomic*,Spins,Live,Cloud_Atomic,Silverblue if release < 20: return ["Everything", "Fedora"] elif release < 28: return ["Everything", "Server", "Workstation"] else: return ["Everything", "Server", "Workstation", "Modular"] def get_last_modified(pkg: repomd.Package) -> datetime: """Get timezone aware last modified time in UTC from RPM package metadata.""" ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build") return datetime.utcfromtimestamp(int(ts)).replace(tzinfo=timezone.utc) def get_checksums(pkg: repomd.Package) -> Dict[str, str]: """Get checksums associated to rpm archive.""" cs = pkg._element.find("common:checksum", namespaces=repomd._ns) cs_type = cs.get("type") if cs_type == "sha": cs_type = "sha1" return {cs_type: cs.text} @dataclass class FedoraListerState: """State of Fedora lister""" package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) """Dictionary mapping a package name to all the versions found during last listing""" class FedoraLister(Lister[FedoraListerState, FedoraPageType]): """ List source packages for given Fedora releases. The lister will create a snapshot for each package name from all its available versions. If a package snapshot is different from the last listing operation, it will be sent to the scheduler that will create a loading task to archive newly found source code. 
Args: scheduler: instance of SchedulerInterface url: fedora package archives mirror URL releases: list of fedora releases to process """ LISTER_NAME = "fedora" def __init__( self, scheduler: SchedulerInterface, instance: str = "fedora", url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", releases: List[Release] = [34, 35, 36], + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=url, instance=instance, credentials={}, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.releases = releases self.listed_origins: Dict[FedoraOrigin, ListedOrigin] = {} "will hold all listed origins info" self.origins_to_send: Set[FedoraOrigin] = set() "will hold updated origins since last listing" self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} "will contain the lister state after a call to run" self.last_page = False def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState: return FedoraListerState(package_versions={k: set(v) for k, v in d.items()}) def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]: return {k: list(v) for k, v in state.package_versions.items()} def page_request(self, release: Release, edition: Edition) -> FedoraPageType: """Return parsed packages for a given fedora release.""" index_url = urljoin( self.url, f"{release}/{edition}/source/SRPMS/" if release < 24 else f"{release}/{edition}/source/tree/", ) repo = repomd.load(index_url) # raises an error if repomd.xml is not found self.last_page = ( release == self.releases[-1] and edition == get_editions(release)[-1] ) logger.debug( "Fetched metadata from url: %s, found %d packages", index_url, len(repo) ) # TODO: Extract more fields like "provides" and "requires" from *primary.xml # as extrinsic metadata using the pkg._element.findtext method return repo def get_pages(self) -> Iterator[FedoraPageType]: """Return an iterator on parsed fedora packages, one page per (release, edition) pair""" for release in self.releases: for edition in get_editions(release): logger.debug("Listing fedora release %s edition %s", release, edition) self.current_release = release self.current_edition = edition try: yield self.page_request(release, edition) except HTTPError as http_error: if http_error.getcode() == 404: logger.debug( "No packages metadata found for fedora release %s edition %s", release, edition, ) continue raise def origin_url_for_package(self, package_name: PkgName) -> FedoraOrigin: """Return the origin url for the given package""" return f"https://src.fedoraproject.org/rpms/{package_name}" def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]: """Convert a page of fedora package sources into an iterator of ListedOrigin.""" assert self.lister_obj.id is not None origins_to_send = set() # iterate on each package's metadata for pkg_metadata in page: # extract package metadata package_name = pkg_metadata.name package_version = pkg_metadata.vr package_version_split = package_version.split(".") if package_version_split[-1].startswith("fc"): # remove trailing ".fcXY" in version for the rpm loader to avoid # creating multiple releases targeting same directory package_version = ".".join(package_version_split[:-1]) package_build_time = get_last_modified(pkg_metadata) package_download_path = pkg_metadata.location # build origin url origin_url = self.origin_url_for_package(package_name) # create package version
key as expected by the fedora (rpm) loader package_version_key = ( f"fedora{self.current_release}/{self.current_edition}/" f"{package_version}" ).lower() # this is the first time a package is listed if origin_url not in self.listed_origins: # create a ListedOrigin object for it that can be later # updated with new package versions info self.listed_origins[origin_url] = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="rpm", extra_loader_arguments={"packages": {}}, last_update=package_build_time, ) # init set that will contain all listed package versions self.package_versions[package_name] = set() # origin will be yielded at the end of that method origins_to_send.add(origin_url) # update package metadata in parameter that will be provided # to the rpm loader self.listed_origins[origin_url].extra_loader_arguments["packages"][ package_version_key ] = { "name": package_name, "version": package_version, "url": urljoin(page.baseurl, package_download_path), "buildTime": package_build_time.isoformat(), "checksums": get_checksums(pkg_metadata), } last_update = self.listed_origins[origin_url].last_update if last_update is not None and package_build_time > last_update: self.listed_origins[origin_url].last_update = package_build_time # add package version key to the set of found versions self.package_versions[package_name].add(package_version_key) # package has already been listed during a previous listing process if package_name in self.state.package_versions: new_versions = ( self.package_versions[package_name] - self.state.package_versions[package_name] ) # no new versions so far, no need to send the origin to the scheduler if not new_versions: origins_to_send.remove(origin_url) logger.debug( "Found %s packages to update (new ones or packages with new versions).", len(origins_to_send), ) logger.debug( "Current total number of listed packages is equal to %s.", len(self.listed_origins), ) # yield from origins_to_send.values() self.origins_to_send.update(origins_to_send) if self.last_page: # yield listed origins when all fedora releases and editions processed yield from [ self.listed_origins[origin_url] for origin_url in self.origins_to_send ] def finalize(self): # set mapping between listed package names and versions as lister state self.state.package_versions = self.package_versions self.updated = len(self.listed_origins) > 0 diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 5728727..738c516 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,205 +1,211 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import datetime import logging from typing import Any, Dict, Iterator, List, Optional, Set from urllib.parse import parse_qs, urlparse import iso8601 from swh.core.github.utils import MissingRateLimitReset from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @dataclass class GitHubListerState: """State of the GitHub lister""" last_seen_id: int = 0 """Numeric id of the last repository listed on an incremental pass""" class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): """List origins from GitHub. 
By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_id` stored in the scheduler backend. Providing the `first_id` and `last_id` arguments enables the "relisting" mode: in that mode, the lister finds the origins present in the range **excluding** `first_id` and **including** `last_id`. In this mode, the lister can overrun the `last_id`: it will always record all the origins seen in a given page. As the lister is fully idempotent, this is not a practical problem. Once relisting completes, the lister state in the scheduler backend is not updated. When the config contains a set of credentials, we shuffle this list at the beginning of the listing. To follow GitHub's `abuse rate limit policy`_, we keep using the same token over and over again, until its rate limit runs out. Once that happens, we switch to the next token over in our shuffled list. When a request fails with a rate limit exception for all tokens, we pause the listing until the largest value for X-Ratelimit-Reset over all tokens. When the credentials aren't set in the lister config, the lister can run in anonymous mode too (e.g. for testing purposes). .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits Args: first_id: the id of the first repo to list last_id: stop listing after seeing a repo with an id higher than this value. """ # noqa: B950 LISTER_NAME = "github" API_URL = "https://api.github.com/repositories" PAGE_SIZE = 1000 def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, first_id: Optional[int] = None, last_id: Optional[int] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance="github", with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.first_id = first_id self.last_id = last_id self.relisting = self.first_id is not None or self.last_id is not None def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: return GitHubListerState(**d) def state_to_dict(self, state: GitHubListerState) -> Dict[str, Any]: return asdict(state) def get_pages(self) -> Iterator[List[Dict[str, Any]]]: current_id = 0 if self.first_id is not None: current_id = self.first_id elif self.state is not None: current_id = self.state.last_seen_id current_url = f"{self.API_URL}?since={current_id}&per_page={self.PAGE_SIZE}" while self.last_id is None or current_id < self.last_id: logger.debug("Getting page %s", current_url) try: assert self.github_session is not None response = self.github_session.request(current_url) except MissingRateLimitReset: # Give up break # We've successfully retrieved a (non-ratelimited) `response`. We # still need to check it for validity. if response.status_code != 200: logger.warning( "Got unexpected status_code %s: %s", response.status_code, response.content, ) break yield response.json() if "next" not in response.links: # No `next` link, we've reached the end of the world logger.debug( "No next link found in the response headers, all caught up" ) break # GitHub strongly advises to use the next link directly. We still # parse it to get the id of the last repository we've reached so # far. 
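# Illustrative example (hypothetical values, not from a real response): a "next" link # typically looks like https://api.github.com/repositories?since=364089&per_page=1000, # so reading its "since" query parameter gives back the id of the last repository # reached, which is exactly what the parsing below does.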
next_url = response.links["next"]["url"] parsed_url = urlparse(next_url) if not parsed_url.query: logger.warning("Failed to parse url %s", next_url) break parsed_query = parse_qs(parsed_url.query) current_id = int(parsed_query["since"][0]) current_url = next_url def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of GitHub repositories into a list of ListedOrigins. This records the html_url, as well as the pushed_at value if it exists. """ assert self.lister_obj.id is not None seen_in_page: Set[str] = set() for repo in page: if not repo: # null repositories in listings happen sometimes... continue if repo["html_url"] in seen_in_page: continue seen_in_page.add(repo["html_url"]) pushed_at_str = repo.get("pushed_at") pushed_at: Optional[datetime.datetime] = None if pushed_at_str: pushed_at = iso8601.parse_date(pushed_at_str) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["html_url"], visit_type="git", last_update=pushed_at, ) def commit_page(self, page: List[Dict[str, Any]]): """Update the currently stored state using the latest listed page""" if self.relisting: # Don't update internal state when relisting return if not page: # Sometimes, when you reach the end of the world, GitHub returns an empty # page of repositories return last_id = page[-1]["id"] if last_id > self.state.last_seen_id: self.state.last_seen_id = last_id def finalize(self): if self.relisting: return # Pull fresh lister state from the scheduler backend scheduler_state = self.get_state_from_scheduler() # Update the lister state in the backend only if the last seen id of # the current run is higher than that stored in the database. if self.state.last_seen_id > scheduler_state.last_seen_id: self.updated = True diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index 7823ee2..3ad2bfd 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,269 +1,275 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging import random from typing import Any, Dict, Iterator, List, Optional, Tuple from urllib.parse import parse_qs, urlencode, urlparse import iso8601 from requests.exceptions import HTTPError from requests.status_codes import codes from tenacity.before_sleep import before_sleep_log from swh.lister.pattern import CredentialsType, Lister from swh.lister.utils import http_retry, is_retryable_exception from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) # Some instance provides hg_git type which can be ingested as hg origins VCS_MAPPING = {"hg_git": "hg"} @dataclass class GitLabListerState: """State of the GitLabLister""" last_seen_next_link: Optional[str] = None """Last link header (not visited yet) during an incremental pass """ Repository = Dict[str, Any] @dataclass class PageResult: """Result from a query to a gitlab project api page.""" repositories: Optional[Tuple[Repository, ...]] = None next_page: Optional[str] = None def _if_rate_limited(retry_state) -> bool: """Custom tenacity retry predicate for handling HTTP responses with status code 403 with specific ratelimit header. 
""" attempt = retry_state.outcome if attempt.failed: exc = attempt.exception() return ( isinstance(exc, HTTPError) and exc.response.status_code == codes.forbidden and int(exc.response.headers.get("RateLimit-Remaining", "0")) == 0 ) or is_retryable_exception(exc) return False def _parse_id_after(url: Optional[str]) -> Optional[int]: """Given an url, extract a return the 'id_after' query parameter associated value or None. This is the the repository id used for pagination purposes. """ if not url: return None # link: https://${project-api}/?...&id_after=2x... query_data = parse_qs(urlparse(url).query) page = query_data.get("id_after") if page and len(page) > 0: return int(page[0]) return None class GitLabLister(Lister[GitLabListerState, PageResult]): """List origins for a gitlab instance. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_next_link` stored in the scheduler backend. Args: scheduler: a scheduler instance url: the api v4 url of the gitlab instance to visit (e.g. https://gitlab.com/api/v4/) instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...), url network location will be used if not provided incremental: defines if incremental listing is activated or not ignored_project_prefixes: List of prefixes of project paths to ignore """ def __init__( self, scheduler, url: str, name: Optional[str] = "gitlab", instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, incremental: bool = False, ignored_project_prefixes: Optional[List[str]] = None, ): if name is not None: self.LISTER_NAME = name super().__init__( scheduler=scheduler, url=url.rstrip("/"), instance=instance, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.last_page: Optional[str] = None self.per_page = 100 self.ignored_project_prefixes: Optional[Tuple[str, ...]] = None if ignored_project_prefixes: self.ignored_project_prefixes = tuple(ignored_project_prefixes) self.session.headers.update({"Accept": "application/json"}) if len(self.credentials) > 0: cred = random.choice(self.credentials) logger.info( "Using %s credentials from user %s", self.instance, cred["username"] ) api_token = cred["password"] if api_token: self.session.headers["Authorization"] = f"Bearer {api_token}" def state_from_dict(self, d: Dict[str, Any]) -> GitLabListerState: return GitLabListerState(**d) def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]: return asdict(state) @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def get_page_result(self, url: str) -> PageResult: logger.debug("Fetching URL %s", url) response = self.session.get(url) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) # GitLab API can return errors 500 when listing projects. # https://gitlab.com/gitlab-org/gitlab/-/issues/262629 # To avoid ending the listing prematurely, skip buggy URLs and move # to next pages. 
if response.status_code == 500: id_after = _parse_id_after(url) assert id_after is not None while True: next_id_after = id_after + self.per_page url = url.replace(f"id_after={id_after}", f"id_after={next_id_after}") response = self.session.get(url) if response.status_code == 200: break else: id_after = next_id_after else: response.raise_for_status() repositories: Tuple[Repository, ...] = tuple(response.json()) if hasattr(response, "links") and response.links.get("next"): next_page = response.links["next"]["url"] else: next_page = None return PageResult(repositories, next_page) def page_url(self, id_after: Optional[int] = None) -> str: parameters = { "pagination": "keyset", "order_by": "id", "sort": "asc", "simple": "true", "per_page": f"{self.per_page}", } if id_after is not None: parameters["id_after"] = str(id_after) return f"{self.url}/projects?{urlencode(parameters)}" def get_pages(self) -> Iterator[PageResult]: next_page: Optional[str] if self.incremental and self.state and self.state.last_seen_next_link: next_page = self.state.last_seen_next_link else: next_page = self.page_url() while next_page: self.last_page = next_page page_result = self.get_page_result(next_page) yield page_result next_page = page_result.next_page def get_origins_from_page(self, page_result: PageResult) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None repositories = page_result.repositories if page_result.repositories else [] for repo in repositories: if self.ignored_project_prefixes and repo["path_with_namespace"].startswith( self.ignored_project_prefixes ): continue visit_type = repo.get("vcs_type", "git") visit_type = VCS_MAPPING.get(visit_type, visit_type) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["http_url_to_repo"], visit_type=visit_type, last_update=iso8601.parse_date(repo["last_activity_at"]), ) def commit_page(self, page_result: PageResult) -> None: """Update currently stored state using the latest listed "next" page if relevant. Relevancy is determined by the next_page link whose 'page' id must be strictly superior to the currently stored one. Note: this is a noop for full listing mode """ if self.incremental: # link: https://${project-api}/?...&page=2x... next_page = page_result.next_page if not next_page and self.last_page: next_page = self.last_page if next_page: id_after = _parse_id_after(next_page) previous_next_page = self.state.last_seen_next_link previous_id_after = _parse_id_after(previous_next_page) if previous_next_page is None or ( previous_id_after and id_after and previous_id_after < id_after ): self.state.last_seen_next_link = next_page def finalize(self) -> None: """finalize the lister state when relevant (see `fn:commit_page` for details) Note: this is a noop for full listing mode """ next_page = self.state.last_seen_next_link if self.incremental and next_page: # link: https://${project-api}/?...&page=2x... 
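# Illustrative example (hypothetical id): for a stored link such as # .../projects?pagination=keyset&order_by=id&...&id_after=4242, _parse_id_after # returns 4242, which is then compared with the value recorded in the scheduler # backend to decide whether any progress was made.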
next_id_after = _parse_id_after(next_page) scheduler_state = self.get_state_from_scheduler() previous_next_id_after = _parse_id_after( scheduler_state.last_seen_next_link ) if (not previous_next_id_after and next_id_after) or ( previous_next_id_after and next_id_after and previous_next_id_after < next_id_after ): self.updated = True diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 65eca1f..721bdc2 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,75 +1,81 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Iterator, Mapping, Optional import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister from .tree import GNUTree logger = logging.getLogger(__name__) GNUPageType = Mapping[str, Any] class GNULister(StatelessLister[GNUPageType]): """ List all GNU projects and associated artifacts. """ LISTER_NAME = "GNU" GNU_FTP_URL = "https://ftp.gnu.org" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.GNU_FTP_URL, instance="GNU", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # no side-effect calls in constructor, if extra state is needed, as preconized # by the pattern docstring, this must happen in the get_pages method. self.gnu_tree: Optional[GNUTree] = None def get_pages(self) -> Iterator[GNUPageType]: """ Yield a single page listing all GNU projects. """ # first fetch the manifest to parse self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") yield self.gnu_tree.projects def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]: """ Iterate on all GNU projects and yield ListedOrigin instances. 
""" assert self.lister_obj.id is not None assert self.gnu_tree is not None artifacts = self.gnu_tree.artifacts for project_name, project_info in page.items(): origin_url = project_info["url"] last_update = iso8601.parse_date(project_info["time_modified"]) logger.debug("Found origin %s last updated on %s", origin_url, last_update) yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="tar", last_update=last_update, extra_loader_arguments={"artifacts": artifacts[project_name]}, ) diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py index ce8a398..cdc5576 100644 --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -1,208 +1,214 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging import random from typing import Any, Dict, Iterator, List, Optional, Tuple from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse import iso8601 from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) Repo = Dict[str, Any] @dataclass class GogsListerPage: repos: Optional[List[Repo]] = None next_link: Optional[str] = None @dataclass class GogsListerState: last_seen_next_link: Optional[str] = None """Last link header (could be already visited) during an incremental pass.""" last_seen_repo_id: Optional[int] = None """Last repo id seen during an incremental pass.""" def _parse_page_id(url: Optional[str]) -> int: """Parse the page id from a Gogs page url.""" if url is None: return 0 return int(parse_qs(urlparse(url).query)["page"][0]) class GogsLister(Lister[GogsListerState, GogsListerPage]): """List origins from the Gogs Gogs API documentation: https://github.com/gogs/docs-api The API may be protected behind authentication so credentials/API tokens can be provided. The lister supports pagination and provides next page URL through the 'next' value of the 'Link' header. The default value for page size ('limit') is 10 but the maximum allowed value is 50. 
The API endpoint can usually be found at the path /api/v1/repos/search on the instance host. """ LISTER_NAME = "gogs" VISIT_TYPE = "git" REPO_LIST_PATH = "repos/search" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, api_token: Optional[str] = None, page_size: int = 50, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.query_params = { "limit": page_size, } self.api_token = api_token if self.api_token is None: if len(self.credentials) > 0: cred = random.choice(self.credentials) username = cred.get("username") self.api_token = cred["password"] logger.info("Using authentication credentials from user %s", username) self.session.headers.update({"Accept": "application/json"}) if self.api_token: self.session.headers["Authorization"] = f"token {self.api_token}" else: logger.warning( "No authentication token set in configuration, using anonymous mode" ) def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState: return GogsListerState(**d) def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]: return asdict(state) def page_request( self, url: str, params: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any]]: logger.debug("Fetching URL %s with params %s", url, params) try: response = self.http_request(url, params=params) except HTTPError as http_error: if ( http_error.response.status_code == 500 ): # Temporary hack for skipping fatal repos (T4423) url_parts = urlparse(url) query: Dict[str, Any] = dict(parse_qsl(url_parts.query)) query.update({"page": _parse_page_id(url) + 1}) next_page_link = url_parts._replace(query=urlencode(query)).geturl() body: Dict[str, Any] = {"data": []} links = {"next": {"url": next_page_link}} return body, links else: raise return response.json(), response.links @classmethod def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]: fields_filter = ["id", "clone_url", "updated_at"] return [{k: r[k] for k in fields_filter} for r in body["data"]] def get_pages(self) -> Iterator[GogsListerPage]: page_id = 1 if self.state.last_seen_next_link is not None: page_id = _parse_page_id(self.state.last_seen_next_link) # base with trailing slash, path without leading slash for urljoin next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH) assert next_link is not None body, links = self.page_request( next_link, {**self.query_params, "page": page_id} ) while next_link is not None: repos = self.extract_repos(body) if "next" in links: next_link = links["next"]["url"] else: next_link = None # Happens for the last page yield GogsListerPage(repos=repos, next_link=next_link) if next_link is not None: body, links = self.page_request(next_link, {}) def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]: """Convert a page of Gogs repositories into a list of ListedOrigins""" assert self.lister_obj.id is not None assert page.repos is not None for r in page.repos: last_update = iso8601.parse_date(r["updated_at"]) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=r["clone_url"], last_update=last_update, ) def commit_page(self, page: GogsListerPage) -> None: last_seen_next_link = page.next_link page_id = _parse_page_id(last_seen_next_link) state_page_id =
_parse_page_id(self.state.last_seen_next_link) if page_id > state_page_id: self.state.last_seen_next_link = last_seen_next_link if (page.repos is not None) and len(page.repos) > 0: self.state.last_seen_repo_id = page.repos[-1]["id"] def finalize(self) -> None: scheduler_state = self.get_state_from_scheduler() state_page_id = _parse_page_id(self.state.last_seen_next_link) scheduler_page_id = _parse_page_id(scheduler_state.last_seen_next_link) state_last_repo_id = self.state.last_seen_repo_id or 0 scheduler_last_repo_id = scheduler_state.last_seen_repo_id or 0 if (state_page_id >= scheduler_page_id) and ( state_last_repo_id > scheduler_last_repo_id ): self.updated = True # Marked updated only if it finds new repos diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py index 10e5935..36a247b 100644 --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -1,164 +1,170 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime import json import logging from typing import Any, Dict, Iterator, List, Optional, Tuple import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @dataclass class GolangStateType: last_seen: Optional[datetime] = None """Last timestamp of a package version we have saved. Used as a starting point for an incremental listing.""" GolangPageType = List[Dict[str, Any]] class GolangLister(Lister[GolangStateType, GolangPageType]): """ List all Golang modules and send associated origins to scheduler. The lister queries the Golang module index, whose documentation can be found at https://index.golang.org """ GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index" # `limit` seems to be... limited to 2000. 
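# Illustrative example (hypothetical timestamp): a page is fetched from # https://index.golang.org/index?limit=2000&since=2022-08-01T00:00:00Z; each response # line is a JSON object whose "Timestamp" field is reused as the "since" value of the # next request.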
GOLANG_MODULES_INDEX_LIMIT = 2000 LISTER_NAME = "golang" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.GOLANG_MODULES_INDEX_URL, instance=self.LISTER_NAME, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) self.incremental = incremental def state_from_dict(self, d: Dict[str, Any]) -> GolangStateType: as_string = d.get("last_seen") last_seen = iso8601.parse_date(as_string) if as_string is not None else None return GolangStateType(last_seen=last_seen) def state_to_dict(self, state: GolangStateType) -> Dict[str, Any]: return { "last_seen": state.last_seen.isoformat() if state.last_seen is not None else None } def finalize(self): if self.incremental and self.state.last_seen is not None: scheduler_state = self.get_state_from_scheduler() if ( scheduler_state.last_seen is None or self.state.last_seen > scheduler_state.last_seen ): self.updated = True def api_request(self, url: str) -> List[str]: response = self.http_request(url) return response.text.split() def get_single_page( self, since: Optional[datetime] = None ) -> Tuple[GolangPageType, Optional[datetime]]: """Return a page from the API and the timestamp of its last entry. Since all entries are sorted by chronological order, the timestamp is useful both for pagination and later for incremental runs.""" url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}" if since is not None: # The Golang index does not understand `+00:00` for some reason # and expects the "timezone zero" notation instead. This works # because all times are UTC. utc_offset = since.utcoffset() assert ( utc_offset is not None and utc_offset.total_seconds() == 0 ), "Non-UTC datetime" as_date = since.isoformat().replace("+00:00", "Z") url = f"{url}&since={as_date}" entries = self.api_request(url) page: GolangPageType = [] if not entries: return page, since for as_json in entries: entry = json.loads(as_json) timestamp = iso8601.parse_date(entry["Timestamp"]) # We've already parsed it and we'll need the datetime later, save it entry["Timestamp"] = timestamp page.append(entry) # The index is guaranteed to be sorted in chronological order since = timestamp return page, since def get_pages(self) -> Iterator[GolangPageType]: since = None if self.incremental: since = self.state.last_seen page, since = self.get_single_page(since=since) if since == self.state.last_seen: # The index returns packages whose timestamp are greater or # equal to the date provided as parameter, which will create # an infinite loop if not stopped here. return [], since if since is not None: self.state.last_seen = since while page: yield page page, since = self.get_single_page(since=since) if since == self.state.last_seen: return [], since if since is not None: self.state.last_seen = since def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]: """ Iterate on all Golang projects and yield ListedOrigin instances. """ assert self.lister_obj.id is not None for module in page: path = module["Path"] # The loader will be expected to use the golang proxy to do the # actual downloading. We're using `pkg.go.dev` so that the URL points # to somewhere useful for a human instead of an (incomplete) API path. 
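# Illustrative example (hypothetical module path): "example.org/some/module" becomes # the origin URL "https://pkg.go.dev/example.org/some/module".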
origin_url = f"https://pkg.go.dev/{path}" # Since the Go index lists versions and not just packages, there will # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side, # so only the last timestamp will be used, with no duplicates. # Performance should not be an issue as they are sent to the db in bulk. yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="golang", last_update=module["Timestamp"], ) diff --git a/swh/lister/hackage/lister.py b/swh/lister/hackage/lister.py index 04fb6f2..a86ff67 100644 --- a/swh/lister/hackage/lister.py +++ b/swh/lister/hackage/lister.py @@ -1,144 +1,150 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime, timezone import logging from typing import Any, Dict, Iterator, List, Optional import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. HackageListerPage = List[Dict[str, Any]] @dataclass class HackageListerState: """Store lister state for incremental mode operations""" last_listing_date: Optional[datetime] = None """Last date when Hackage lister was executed""" class HackageLister(Lister[HackageListerState, HackageListerPage]): """List Hackage (The Haskell Package Repository) origins.""" LISTER_NAME = "hackage" VISIT_TYPE = "hackage" INSTANCE = "hackage" BASE_URL = "https://hackage.haskell.org/" PACKAGE_NAMES_URL_PATTERN = "{base_url}packages/search" PACKAGE_INFO_URL_PATTERN = "{base_url}package/{pkgname}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, url: Optional[str] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=url if url else self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Ensure to set this with same value as the http api search endpoint use # (50 as of august 2022) self.page_size: int = 50 self.listing_date = datetime.now().astimezone(tz=timezone.utc) def state_from_dict(self, d: Dict[str, Any]) -> HackageListerState: last_listing_date = d.get("last_listing_date") if last_listing_date is not None: d["last_listing_date"] = iso8601.parse_date(last_listing_date) return HackageListerState(**d) def state_to_dict(self, state: HackageListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {"last_listing_date": None} last_listing_date = state.last_listing_date if last_listing_date is not None: d["last_listing_date"] = last_listing_date.isoformat() return d def get_pages(self) -> Iterator[HackageListerPage]: """Yield an iterator which returns 'page' It uses the http api endpoint `https://hackage.haskell.org/packages/search` to get a list of package names from which we build an origin url. Results are paginated. 
""" # Search query sq = "(deprecated:any)" if self.state.last_listing_date: last_str = ( self.state.last_listing_date.astimezone(tz=timezone.utc) .date() .isoformat() ) # Incremental mode search query sq += "(lastUpload >= %s)" % last_str params = { "page": 0, "sortColumn": "default", "sortDirection": "ascending", "searchQuery": sq, } data = self.http_request( url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), method="POST", json=params, ).json() if data.get("pageContents"): nb_entries: int = data["numberOfResults"] (nb_pages, remainder) = divmod(nb_entries, self.page_size) if remainder: nb_pages += 1 # First page yield data["pageContents"] # Next pages for page in range(1, nb_pages): params["page"] = page data = self.http_request( url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), method="POST", json=params, ).json() yield data["pageContents"] def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for entry in page: pkgname = entry["name"]["display"] last_update = iso8601.parse_date(entry["lastUpload"]) url = self.PACKAGE_INFO_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=last_update, ) def finalize(self) -> None: self.state.last_listing_date = self.listing_date self.updated = True diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index e9c36fa..b9daa18 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -1,209 +1,215 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime import logging from typing import Any, Dict, Iterator, Optional, Tuple import iso8601 from launchpadlib.launchpad import Launchpad from lazr.restfulclient.errors import RestfulError from lazr.restfulclient.resource import Collection from tenacity.before_sleep import before_sleep_log from swh.lister.utils import http_retry, retry_if_exception from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) VcsType = str LaunchpadPageType = Tuple[VcsType, Collection] SUPPORTED_VCS_TYPES = ("git", "bzr") @dataclass class LaunchpadListerState: """State of Launchpad lister""" git_date_last_modified: Optional[datetime] = None """modification date of last updated git repository since last listing""" bzr_date_last_modified: Optional[datetime] = None """modification date of last updated bzr repository since last listing""" def origin(vcs_type: str, repo: Any) -> str: """Determine the origin url out of a repository with a given vcs_type""" return repo.git_https_url if vcs_type == "git" else repo.web_link def retry_if_restful_error(retry_state): return retry_if_exception(retry_state, lambda e: isinstance(e, RestfulError)) class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): """ List repositories from Launchpad (git or bzr). 
Args: scheduler: instance of SchedulerInterface incremental: defines if incremental listing should be used, in that case only modified or new repositories since last incremental listing operation will be returned """ LISTER_NAME = "launchpad" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url="https://launchpad.net/", instance="launchpad", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.date_last_modified: Dict[str, Optional[datetime]] = { "git": None, "bzr": None, } def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState: for vcs_type in SUPPORTED_VCS_TYPES: key = f"{vcs_type}_date_last_modified" date_last_modified = d.get(key) if date_last_modified is not None: d[key] = iso8601.parse_date(date_last_modified) return LaunchpadListerState(**d) def state_to_dict(self, state: LaunchpadListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {} for vcs_type in SUPPORTED_VCS_TYPES: attribute_name = f"{vcs_type}_date_last_modified" d[attribute_name] = None if hasattr(state, attribute_name): date_last_modified = getattr(state, attribute_name) if date_last_modified is not None: d[attribute_name] = date_last_modified.isoformat() return d @http_retry( retry=retry_if_restful_error, before_sleep=before_sleep_log(logger, logging.WARNING), ) def _page_request( self, launchpad, vcs_type: str, date_last_modified: Optional[datetime] ) -> Optional[Collection]: """Querying the page of results for a given vcs_type since the date_last_modified. If some issues occurs, this will deal with the retrying policy. """ get_vcs_fns = { "git": launchpad.git_repositories.getRepositories, "bzr": launchpad.branches.getBranches, } return get_vcs_fns[vcs_type]( order_by="most neglected first", modified_since_date=date_last_modified, ) def get_pages(self) -> Iterator[LaunchpadPageType]: """ Yields an iterator on all git/bzr repositories hosted on Launchpad sorted by last modification date in ascending order. """ launchpad = Launchpad.login_anonymously( "softwareheritage", "production", version="devel" ) if self.incremental: self.date_last_modified = { "git": self.state.git_date_last_modified, "bzr": self.state.bzr_date_last_modified, } for vcs_type in SUPPORTED_VCS_TYPES: try: result = self._page_request( launchpad, vcs_type, self.date_last_modified[vcs_type] ) except RestfulError as e: logger.warning("Listing %s origins raised %s", vcs_type, e) result = None if not result: continue yield vcs_type, result def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]: """ Iterate on all git repositories and yield ListedOrigin instances. 
""" assert self.lister_obj.id is not None vcs_type, repos = page try: for repo in repos: origin_url = origin(vcs_type, repo) # filter out origins with invalid URL if not origin_url.startswith("https://"): continue last_update = repo.date_last_modified self.date_last_modified[vcs_type] = last_update logger.debug( "Found origin %s with type %s last updated on %s", origin_url, vcs_type, last_update, ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=vcs_type, url=origin_url, last_update=last_update, ) except RestfulError as e: logger.warning("Listing %s origins raised %s", vcs_type, e) def finalize(self) -> None: git_date_last_modified = self.date_last_modified["git"] bzr_date_last_modified = self.date_last_modified["bzr"] if git_date_last_modified is None and bzr_date_last_modified is None: return if self.incremental and ( self.state.git_date_last_modified is None or ( git_date_last_modified is not None and git_date_last_modified > self.state.git_date_last_modified ) ): self.state.git_date_last_modified = git_date_last_modified if self.incremental and ( self.state.bzr_date_last_modified is None or ( bzr_date_last_modified is not None and bzr_date_last_modified > self.state.bzr_date_last_modified ) ): self.state.bzr_date_last_modified = self.date_last_modified["bzr"] self.updated = True diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 195a8a3..8dc702c 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -1,403 +1,409 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin from bs4 import BeautifulSoup import lxml import requests from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr") @dataclass class MavenListerState: """State of the MavenLister""" last_seen_doc: int = -1 """Last doc ID ingested during an incremental pass """ last_seen_pom: int = -1 """Last doc ID related to a pom and ingested during an incremental pass """ class MavenLister(Lister[MavenListerState, RepoPage]): """List origins from a Maven repository. Maven Central provides artifacts for Java builds. It includes POM files and source archives, which we download to get the source code of artifacts and links to their scm repository. This lister yields origins of types: git/svn/hg or whatever the Artifacts use as repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" def __init__( self, scheduler: SchedulerInterface, url: str, index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, incremental: bool = True, ): """Lister class for Maven repositories. Args: url: main URL of the Maven repository, i.e. url of the base index used to fetch maven artifacts. For Maven central use https://repo1.maven.org/maven2/ index_url: the URL to download the exported text indexes from. 
Would typically be a local host running the export docker image. See README.md in this directory for more information. instance: Name of maven instance. Defaults to url's network location if unset. incremental: bool, defaults to True. Defines if incremental listing is activated or not. """ self.BASE_URL = url self.INDEX_URL = index_url self.incremental = incremental super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) self.jar_origins: Dict[str, ListedOrigin] = {} def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) def get_pages(self) -> Iterator[RepoPage]: """Retrieve and parse exported maven indexes to identify all pom files and src archives. """ # Example of returned RepoPage's: # [ # { # "type": "maven", # "url": "https://maven.xwiki.org/..-5.4.2-sources.jar", # "time": 1626109619335, # "gid": "org.xwiki.platform", # "aid": "xwiki-platform-wikistream-events-xwiki", # "version": "5.4.2" # }, # { # "type": "scm", # "url": "scm:git:git://github.com/openengsb/openengsb-framework.git", # "project": "openengsb-framework", # }, # ... # ] # Download the main text index file. logger.info("Downloading computed index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None try: response = self.http_request(self.INDEX_URL, stream=True) except requests.HTTPError: logger.error("Index %s not found, stopping", self.INDEX_URL) raise # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P<doc>\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|" + r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" ) # Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). jar_src["doc"] = doc_id else: m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id.
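# Illustrative example (hypothetical ids): with state.last_seen_pom == 40, a pom # belonging to doc 35 is skipped here, while a pom belonging to doc 41 is recorded # in out_pom and fetched later to look for scm information.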
if ( self.incremental and self.state and self.state.last_seen_pom and self.state.last_seen_pom >= doc_id ): continue url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}" url_pom = urljoin( self.BASE_URL, url_path, ) out_pom[url_pom] = doc_id elif ( classifier.lower() == "sources" or ("src" in classifier) ) and ext.lower() in ("zip", "jar"): url_path = ( f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}" ) url_src = urljoin(self.BASE_URL, url_path) jar_src["gid"] = gid jar_src["aid"] = aid jar_src["version"] = version else: m_time = re_time.match(line) if m_time is not None and url_src is not None: time = m_time.group("mtime") jar_src["time"] = int(time) artifact_metadata_d = { "type": "maven", "url": url_src, **jar_src, } logger.debug( "* Yielding jar %s: %s", url_src, artifact_metadata_d ) yield artifact_metadata_d url_src = None logger.info("Found %s poms.", len(out_pom)) # Now fetch pom files and scan them for scm info. logger.info("Fetching poms..") for pom_url in out_pom: try: response = self.http_request(pom_url) parsed_pom = BeautifulSoup(response.content, "xml") project = parsed_pom.find("project") if project is None: continue scm = project.find("scm") if scm is not None: connection = scm.find("connection") if connection is not None: artifact_metadata_d = { "type": "scm", "doc": out_pom[pom_url], "url": connection.text, } logger.debug( "* Yielding pom %s: %s", pom_url, artifact_metadata_d ) yield artifact_metadata_d else: logger.debug("No scm.connection in pom %s", pom_url) else: logger.debug("No scm in pom %s", pom_url) except requests.HTTPError: logger.warning( "POM info page could not be fetched, skipping project '%s'", pom_url, ) except lxml.etree.Error as error: logger.info("Could not parse POM %s XML: %s.", pom_url, error) def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: """Retrieve scm origin out of the page information. Only called when type of the page is scm. Try and detect an scm/vcs repository. Note that official format is in the form: scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put the repo url (without the "scm:type"), so we have to check against the content to extract the type and url properly. Raises AssertionError when the type of the page is not 'scm' Returns ListedOrigin with proper canonical scm url (for github) if any is found, None otherwise. 
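For example (illustrative, following the official format above): scm:git:git://example.org/user/project.git yields visit_type "git" and url git://example.org/user/project.git, the latter being rewritten to its canonical https form when it points to GitHub.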
""" assert page["type"] == "scm" visit_type: Optional[str] = None url: Optional[str] = None m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) if m_scm is None: return None scm_type = m_scm.group("type") if scm_type and scm_type in SUPPORTED_SCM_TYPES: url = m_scm.group("url") visit_type = scm_type elif page["url"].endswith(".git"): url = page["url"].lstrip("scm:") visit_type = "git" else: return None if url and visit_type == "git": assert self.github_session is not None # Non-github urls will be returned as is, github ones will be canonical ones url = self.github_session.get_canonical_url(url) if not url: return None assert visit_type is not None assert self.lister_obj.id is not None return ListedOrigin( lister_id=self.lister_obj.id, url=url, visit_type=visit_type, ) def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: """Convert a page of Maven repositories into a list of ListedOrigins.""" if page["type"] == "scm": listed_origin = self.get_scm(page) if listed_origin: yield listed_origin else: # Origin is gathering source archives: last_update_dt = None last_update_iso = "" try: last_update_seconds = str(page["time"])[:-3] last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) last_update_dt = last_update_dt.astimezone(timezone.utc) except (OverflowError, ValueError): logger.warning("- Failed to convert datetime %s.", last_update_seconds) if last_update_dt: last_update_iso = last_update_dt.isoformat() # Origin URL will target page holding sources for all versions of # an artifactId (package name) inside a groupId (namespace) path = "/".join(page["gid"].split(".")) origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}") artifact = { **{k: v for k, v in page.items() if k != "doc"}, "time": last_update_iso, "base_url": self.BASE_URL, } if origin_url not in self.jar_origins: # Create ListedOrigin instance if we did not see that origin yet assert self.lister_obj.id is not None jar_origin = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=page["type"], last_update=last_update_dt, extra_loader_arguments={"artifacts": [artifact]}, ) self.jar_origins[origin_url] = jar_origin else: # Update list of source artifacts for that origin otherwise jar_origin = self.jar_origins[origin_url] artifacts = jar_origin.extra_loader_arguments["artifacts"] if artifact not in artifacts: artifacts.append(artifact) if ( jar_origin.last_update and last_update_dt and last_update_dt > jar_origin.last_update ): jar_origin.last_update = last_update_dt if not self.incremental or ( self.state and page["doc"] > self.state.last_seen_doc ): # Yield origin with updated source artifacts, multiple instances of # ListedOrigin for the same origin URL but with different artifacts # list will be sent to the scheduler but it will deduplicate them and # take the latest one to upsert in database yield jar_origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. Note: this is a noop for full listing mode """ if self.incremental and self.state: # We need to differentiate the two state counters according # to the type of origin. if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc: self.state.last_seen_doc = page["doc"] elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom: self.state.last_seen_doc = page["doc"] self.state.last_seen_pom = page["doc"] def finalize(self) -> None: """Finalize the lister state, set update if any progress has been made. 
Note: this is a noop for full listing mode """ if self.incremental and self.state: last_seen_doc = self.state.last_seen_doc last_seen_pom = self.state.last_seen_pom scheduler_state = self.get_state_from_scheduler() if last_seen_doc and last_seen_pom: if (scheduler_state.last_seen_doc < last_seen_doc) or ( scheduler_state.last_seen_pom < last_seen_pom ): self.updated = True diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py index 3e410aa..3440a8e 100644 --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -1,566 +1,572 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """NixGuix lister definition. This lists artifacts out of manifest for Guix or Nixpkgs manifests. Artifacts can be of types: - upstream git repository (NixOS/nixpkgs, Guix) - VCS repositories (svn, git, hg, ...) - unique file - unique tarball """ import base64 import binascii from dataclasses import dataclass from enum import Enum import logging from pathlib import Path import random import re from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import parse_qsl, urlparse import requests from requests.exceptions import ConnectionError, InvalidSchema, SSLError from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT from swh.lister import TARBALL_EXTENSIONS from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) # By default, ignore binary files and archives containing binaries DEFAULT_EXTENSIONS_TO_IGNORE = [ "AppImage", "bin", "exe", "iso", "linux64", "msi", "png", "dic", "deb", "rpm", ] class ArtifactNatureUndetected(ValueError): """Raised when a remote artifact's nature (tarball, file) cannot be detected.""" pass class ArtifactNatureMistyped(ValueError): """Raised when a remote artifact is neither a tarball nor a file. Error of this type are' probably a misconfiguration in the manifest generation that badly typed a vcs repository. """ pass class ArtifactWithoutExtension(ValueError): """Raised when an artifact nature cannot be determined by its name.""" pass class ChecksumsComputation(Enum): """The possible artifact types listed out of the manifest.""" STANDARD = "standard" """Standard checksums (e.g. sha1, sha256, ...) on the tarball or file.""" NAR = "nar" """The hash is computed over the NAR archive dump of the output (e.g. uncompressed directory.)""" MAPPING_CHECKSUMS_COMPUTATION = { "flat": ChecksumsComputation.STANDARD, "recursive": ChecksumsComputation.NAR, } """Mapping between the outputHashMode from the manifest and how to compute checksums.""" @dataclass class Artifact: """Metadata information on Remote Artifact with url (tarball or file).""" origin: str """Canonical url retrieve the tarball artifact.""" visit_type: str """Either 'tar' or 'file' """ fallback_urls: List[str] """List of urls to retrieve tarball artifact if canonical url no longer works.""" checksums: Dict[str, str] """Integrity hash converted into a checksum dict.""" checksums_computation: ChecksumsComputation """Checksums computation mode to provide to loaders (e.g. nar, standard, ...)""" @dataclass class VCS: """Metadata information on VCS.""" origin: str """Origin url of the vcs""" type: str """Type of (d)vcs, e.g. 
svn, git, hg, ...""" ref: Optional[str] = None """Reference either a svn commit id, a git commit, ...""" class ArtifactType(Enum): """The possible artifact types listed out of the manifest.""" ARTIFACT = "artifact" VCS = "vcs" PageResult = Tuple[ArtifactType, Union[Artifact, VCS]] VCS_SUPPORTED = ("git", "svn", "hg") # Rough approximation of what we can find of mimetypes for tarballs "out there" POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys()) PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+") def url_endswith( urlparsed, extensions: List[str], raise_when_no_extension: bool = True ) -> bool: """Determine whether urlparsed ends with one of the extensions passed as parameter. This also account for the edge case of a filename with only a version as name (so no extension in the end.) Raises: ArtifactWithoutExtension in case no extension is available and raise_when_no_extension is True (the default) """ paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)] if raise_when_no_extension and not any(path.suffix != "" for path in paths): raise ArtifactWithoutExtension match = any(path.suffix.endswith(tuple(extensions)) for path in paths) if match: return match # Some false negative can happen (e.g. https:///path/0.1.5)), so make sure # to catch those name = Path(urlparsed.path).name if not PATTERN_VERSION.match(name): return match if raise_when_no_extension: raise ArtifactWithoutExtension return False def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]: """Determine whether a list of files actually are tarballs or simple files. When this cannot be answered simply out of the url, when request is provided, this executes a HTTP `HEAD` query on the url to determine the information. If request is not provided, this raises an ArtifactNatureUndetected exception. Args: urls: name of the remote files for which the extension needs to be checked. Raises: ArtifactNatureUndetected when the artifact's nature cannot be detected out of its url ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to the caller to do what's right with it. Returns: A tuple (bool, url). The boolean represents whether the url is an archive or not. The second parameter is the actual url once the head request is issued as a fallback of not finding out whether the urls are tarballs or not. """ def _is_tarball(url): """Determine out of an extension whether url is a tarball. Raises: ArtifactWithoutExtension in case no extension is available """ urlparsed = urlparse(url) if urlparsed.scheme not in ("http", "https", "ftp"): raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") return url_endswith(urlparsed, TARBALL_EXTENSIONS) index = random.randrange(len(urls)) url = urls[index] try: return _is_tarball(url), urls[0] except ArtifactWithoutExtension: if request is None: raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" ) logger.warning( "Cannot detect extension for <%s>. 
Fallback to http head query", url, ) try: response = request.head(url) except (InvalidSchema, SSLError, ConnectionError): raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" ) if not response.ok or response.status_code == 404: raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" ) location = response.headers.get("Location") if location: # It's not always present logger.debug("Location: %s", location) try: # FIXME: location is also returned as it's considered the true origin, # true enough? return _is_tarball(location), location except ArtifactWithoutExtension: logger.warning( "Still cannot detect extension through location <%s>...", url, ) origin = urls[0] content_type = response.headers.get("Content-Type") if content_type: logger.debug("Content-Type: %s", content_type) if content_type == "application/json": return False, origin return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin content_disposition = response.headers.get("Content-Disposition") if content_disposition: logger.debug("Content-Disposition: %s", content_disposition) if "filename=" in content_disposition: fields = content_disposition.split("; ") for field in fields: if "filename=" in field: _, filename = field.split("filename=") break return ( url_endswith( urlparse(filename), TARBALL_EXTENSIONS, raise_when_no_extension=False, ), origin, ) raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" ) VCS_KEYS_MAPPING = { "git": { "ref": "git_ref", "url": "git_url", }, "svn": { "ref": "svn_revision", "url": "svn_url", }, "hg": { "ref": "hg_changeset", "url": "hg_url", }, } class NixGuixLister(StatelessLister[PageResult]): """List Guix or Nix sources out of a public json manifest. This lister can output: - unique tarball (.tar.gz, .tbz2, ...) - vcs repositories (e.g. git, hg, svn) - unique file (.lisp, .py, ...) Note that no `last_update` is available in either manifest. For `url` types artifacts, this tries to determine the artifact's nature, tarball or file. It first tries to compute out of the "url" extension. In case of no extension, it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location` response header, and then checks the extension again. Optionally, when the `extension_to_ignore` parameter is provided, it extends the default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed. This can be used to drop further binary files detected in the wild. """ LISTER_NAME = "nixguix" def __init__( self, scheduler, url: str, origin_upstream: str, instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, # canonicalize urls, can be turned off during docker runs canonicalize: bool = True, extensions_to_ignore: List[str] = [], **kwargs: Any, ): super().__init__( scheduler=scheduler, url=url.rstrip("/"), instance=instance, credentials=credentials, with_github_session=canonicalize, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # either full fqdn NixOS/nixpkgs or guix repository urls # maybe add an assert on those specific urls? 
self.origin_upstream = origin_upstream self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore self.session = requests.Session() def build_artifact( self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None ) -> Optional[Tuple[ArtifactType, VCS]]: """Build a canonicalized vcs artifact when possible.""" origin = ( self.github_session.get_canonical_url(artifact_url) if self.github_session else artifact_url ) if not origin: return None return ArtifactType.VCS, VCS( origin=origin, type=artifact_type, ref=artifact_ref ) def get_pages(self) -> Iterator[PageResult]: """Yield one page per "typed" origin referenced in manifest.""" # fetch and parse the manifest... response = self.http_request(self.url) # ... if any raw_data = response.json() yield ArtifactType.VCS, VCS(origin=self.origin_upstream, type="git") # grep '"type"' guix-sources.json | sort | uniq # "type": false <<<<<<<<< noise # "type": "git", # "type": "hg", # "type": "no-origin", <<<<<<<<< noise # "type": "svn", # "type": "url", # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq # "type": "url", sources = raw_data["sources"] random.shuffle(sources) for artifact in sources: artifact_type = artifact["type"] if artifact_type in VCS_SUPPORTED: plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]] plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]] built_artifact = self.build_artifact( plain_url, artifact_type, plain_ref ) if not built_artifact: continue yield built_artifact elif artifact_type == "url": # It's either a tarball or a file origin_urls = artifact.get("urls") if not origin_urls: # Nothing to fetch logger.warning("Skipping url <%s>: empty artifact", artifact) continue assert origin_urls is not None # Deal with urls with empty scheme (basic fallback to http) urls = [] for url in origin_urls: urlparsed = urlparse(url) if urlparsed.scheme == "" and not re.match(r"^\w+@[^/]+:", url): logger.warning("Missing scheme for <%s>: fallback to http", url) fixed_url = f"http://{url}" else: fixed_url = url urls.append(fixed_url) origin, *fallback_urls = urls if origin.endswith(".git"): built_artifact = self.build_artifact(origin, "git") if not built_artifact: continue yield built_artifact continue outputHash = artifact.get("outputHash") integrity = artifact.get("integrity") if integrity is None and outputHash is None: logger.warning( "Skipping url <%s>: missing integrity and outputHash field", origin, ) continue # Falls back to outputHash field if integrity is missing if integrity is None and outputHash: # We'll deal with outputHash as integrity field integrity = outputHash try: is_tar, origin = is_tarball(urls, self.session) except ArtifactNatureMistyped: logger.warning( "Mistyped url <%s>: trying to deal with it properly", origin ) urlparsed = urlparse(origin) artifact_type = urlparsed.scheme if artifact_type in VCS_SUPPORTED: built_artifact = self.build_artifact(origin, artifact_type) if not built_artifact: continue yield built_artifact else: logger.warning( "Skipping url <%s>: undetected remote artifact type", origin ) continue except ArtifactNatureUndetected: logger.warning( "Skipping url <%s>: undetected remote artifact type", origin ) continue # Determine the content checksum stored in the integrity field and # convert into a dict of checksums. 
This only parses the # `hash-expression` (hash-) as defined in # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute try: chksum_algo, chksum_b64 = integrity.split("-") checksums: Dict[str, str] = { chksum_algo: base64.decodebytes(chksum_b64.encode()).hex() } except binascii.Error: logger.exception( "Skipping url: <%s>: integrity computation failure for <%s>", url, artifact, ) continue # The 'outputHashMode' attribute determines how the hash is computed. It # must be one of the following two values: # - "flat": (default) The output must be a non-executable regular file. # If it isn’t, the build fails. The hash is simply computed over the # contents of that file (so it’s equal to what Unix commands like # `sha256sum` or `sha1sum` produce). # - "recursive": The hash is computed over the NAR archive dump of the # output (i.e., the result of `nix-store --dump`). In this case, # the output can be anything, including a directory tree. outputHashMode = artifact.get("outputHashMode", "flat") if not is_tar and outputHashMode == "recursive": # T4608: Cannot deal with those properly yet as some can be missing # 'critical' information about how to recompute the hash (e.g. fs # layout, executable bit, ...) logger.warning( "Skipping artifact <%s>: 'file' artifact of type <%s> is" " missing information to properly check its integrity", artifact, artifact_type, ) continue # At this point plenty of heuristics happened and we should have found # the right origin and its nature. # Let's check and filter it out if it is to be ignored (if possible). # Some origin urls may not have extension at this point (e.g # http://git.marmaro.de/?p=mmh;a=snp;h=;sf=tgz), let them through. if url_endswith( urlparse(origin), self.extensions_to_ignore, raise_when_no_extension=False, ): logger.warning( "Skipping artifact <%s>: 'file' artifact of type <%s> is" " ignored due to lister configuration. It should ignore" " origins with extension [%s]", origin, artifact_type, ",".join(self.extensions_to_ignore), ) continue logger.debug("%s: %s", "dir" if is_tar else "cnt", origin) yield ArtifactType.ARTIFACT, Artifact( origin=origin, fallback_urls=fallback_urls, checksums=checksums, checksums_computation=MAPPING_CHECKSUMS_COMPUTATION[outputHashMode], visit_type="directory" if is_tar else "content", ) else: logger.warning( "Skipping artifact <%s>: unsupported type %s", artifact, artifact_type, ) def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]: """Given a vcs repository, yield a ListedOrigin.""" assert self.lister_obj.id is not None # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...) 
yield ListedOrigin( lister_id=self.lister_obj.id, url=artifact.origin, visit_type=artifact.type, ) def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]: """Given an artifact (tarball, file), yield one ListedOrigin.""" assert self.lister_obj.id is not None yield ListedOrigin( lister_id=self.lister_obj.id, url=artifact.origin, visit_type=artifact.visit_type, extra_loader_arguments={ "checksums": artifact.checksums, "checksums_computation": artifact.checksums_computation.value, "fallback_urls": artifact.fallback_urls, }, ) def get_origins_from_page( self, artifact_tuple: PageResult ) -> Iterator[ListedOrigin]: """Given an artifact tuple (type, artifact), yield a ListedOrigin.""" artifact_type, artifact = artifact_tuple mapping_type_fn = getattr(self, f"{artifact_type.value}_to_listed_origin") yield from mapping_type_fn(artifact) diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py index b940699..f10c02d 100644 --- a/swh/lister/npm/lister.py +++ b/swh/lister/npm/lister.py @@ -1,170 +1,176 @@ # Copyright (C) 2018-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging from typing import Any, Dict, Iterator, List, Optional import iso8601 from swh.lister.pattern import CredentialsType, Lister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) @dataclass class NpmListerState: """State of npm lister""" last_seq: Optional[int] = None class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): """ List all packages hosted on the npm registry. The lister is based on the npm replication API powered by a CouchDB database (https://docs.couchdb.org/en/stable/api/database/). 
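In incremental mode the lister queries the database `_changes` endpoint, while a full listing pages through `_all_docs`.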
Args: scheduler: a scheduler instance page_size: number of packages info to return per page when querying npm API incremental: defines if incremental listing should be used, in that case only modified or new packages since last incremental listing operation will be returned, otherwise all packages will be listed in lexicographical order """ LISTER_NAME = "npm" INSTANCE = "npm" API_BASE_URL = "https://replicate.npmjs.com" API_INCREMENTAL_LISTING_URL = f"{API_BASE_URL}/_changes" API_FULL_LISTING_URL = f"{API_BASE_URL}/_all_docs" PACKAGE_URL_TEMPLATE = "https://www.npmjs.com/package/{package_name}" def __init__( self, scheduler: SchedulerInterface, page_size: int = 1000, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_INCREMENTAL_LISTING_URL if incremental else self.API_FULL_LISTING_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.page_size = page_size if not incremental: # in full listing mode, first package in each page corresponds to the one # provided as the startkey query parameter value, so we increment the page # size by one to avoid double package processing self.page_size += 1 self.incremental = incremental self.session.headers.update({"Accept": "application/json"}) def state_from_dict(self, d: Dict[str, Any]) -> NpmListerState: return NpmListerState(**d) def state_to_dict(self, state: NpmListerState) -> Dict[str, Any]: return asdict(state) def request_params(self, last_package_id: str) -> Dict[str, Any]: # include package JSON document to get its last update date params = {"limit": self.page_size, "include_docs": "true"} if self.incremental: params["since"] = last_package_id else: params["startkey"] = last_package_id return params def get_pages(self) -> Iterator[List[Dict[str, Any]]]: last_package_id: str = "0" if self.incremental else '""' if ( self.incremental and self.state is not None and self.state.last_seq is not None ): last_package_id = str(self.state.last_seq) while True: response = self.http_request( self.url, params=self.request_params(last_package_id) ) data = response.json() page = data["results"] if self.incremental else data["rows"] if not page: break if self.incremental or len(page) < self.page_size: yield page else: yield page[:-1] if len(page) < self.page_size: break last_package_id = ( str(page[-1]["seq"]) if self.incremental else f'"{page[-1]["id"]}"' ) def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of Npm repositories into a list of ListedOrigin.""" assert self.lister_obj.id is not None for package in page: # no source code to archive here if not package["doc"].get("versions", {}): continue package_name = package["doc"]["name"] package_latest_version = ( package["doc"].get("dist-tags", {}).get("latest", "") ) last_update = None if package_latest_version in package["doc"].get("time", {}): last_update = iso8601.parse_date( package["doc"]["time"][package_latest_version] ) yield ListedOrigin( lister_id=self.lister_obj.id, url=self.PACKAGE_URL_TEMPLATE.format(package_name=package_name), visit_type="npm", last_update=last_update, ) def commit_page(self, page: List[Dict[str, Any]]): """Update the currently stored state using the latest listed page.""" if self.incremental: last_package = page[-1] last_seq = 
last_package["seq"] if self.state.last_seq is None or last_seq > self.state.last_seq: self.state.last_seq = last_seq def finalize(self): if self.incremental and self.state.last_seq is not None: scheduler_state = self.get_state_from_scheduler() if ( scheduler_state.last_seq is None or self.state.last_seq > scheduler_state.last_seq ): self.updated = True diff --git a/swh/lister/nuget/lister.py b/swh/lister/nuget/lister.py index 54a6c22..98f9fc9 100644 --- a/swh/lister/nuget/lister.py +++ b/swh/lister/nuget/lister.py @@ -1,158 +1,164 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime import logging from typing import Any, Dict, Iterator, List, Optional from bs4 import BeautifulSoup import iso8601 from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. NugetListerPage = List[Dict[str, str]] @dataclass class NugetListerState: """Store lister state for incremental mode operations""" last_listing_date: Optional[datetime] = None """Last date from main http api endpoint when lister was executed""" class NugetLister(Lister[NugetListerState, NugetListerPage]): """List Nuget (Package manager for .NET) origins.""" LISTER_NAME = "nuget" INSTANCE = "nuget" API_INDEX_URL = "https://api.nuget.org/v3/catalog0/index.json" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_INDEX_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.listing_date: Optional[datetime] = None def state_from_dict(self, d: Dict[str, Any]) -> NugetListerState: last_listing_date = d.get("last_listing_date") if last_listing_date is not None: d["last_listing_date"] = iso8601.parse_date(last_listing_date) return NugetListerState(**d) def state_to_dict(self, state: NugetListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {"last_listing_date": None} last_listing_date = state.last_listing_date if last_listing_date is not None: d["last_listing_date"] = last_listing_date.isoformat() return d def get_pages(self) -> Iterator[NugetListerPage]: """Yield an iterator which returns 'page' It uses the following endpoint `https://api.nuget.org/v3/catalog0/index.json` to get a list of pages endpoint to iterate. 
""" index_response = self.http_request(url=self.url) index = index_response.json() assert "commitTimeStamp" in index self.listing_date = iso8601.parse_date(index["commitTimeStamp"]) assert "items" in index for page in index["items"]: assert page["@id"] assert page["commitTimeStamp"] commit_timestamp = iso8601.parse_date(page["commitTimeStamp"]) if ( not self.state.last_listing_date or commit_timestamp > self.state.last_listing_date ): try: page_response = self.http_request(url=page["@id"]) page_data = page_response.json() assert "items" in page_data yield page_data["items"] except HTTPError: logger.warning( "Failed to fetch page %s, skipping it from listing.", page["@id"], ) continue def get_origins_from_page(self, page: NugetListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances. .NET packages are binary, dll, etc. We retrieve only packages for which we can find a vcs repository. To check if a vcs repository exists, we need for each entry in a page to retrieve a .nuspec file, which is a package metadata xml file, and search for a `repository` value. """ assert self.lister_obj.id is not None for elt in page: try: res = self.http_request(url=elt["@id"]) except HTTPError: logger.warning( "Failed to fetch page %s, skipping it from listing.", elt["@id"], ) continue data = res.json() pkgname = data["id"] nuspec_url = ( f"https://api.nuget.org/v3-flatcontainer/{pkgname.lower()}/" f"{data['version'].lower()}/{pkgname.lower()}.nuspec" ) try: res_metadata = self.http_request(url=nuspec_url) except HTTPError: logger.warning( "Failed to fetch nuspec file %s, skipping it from listing.", nuspec_url, ) continue xml = BeautifulSoup(res_metadata.content, "xml") repo = xml.find("repository") if repo and "url" in repo.attrs and "type" in repo.attrs: vcs_url = repo.attrs["url"] vcs_type = repo.attrs["type"] last_update = iso8601.parse_date(elt["commitTimeStamp"]) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=vcs_type, url=vcs_url, last_update=last_update, ) else: continue def finalize(self) -> None: self.state.last_listing_date = self.listing_date self.updated = True diff --git a/swh/lister/opam/lister.py b/swh/lister/opam/lister.py index 724d198..6b54e66 100644 --- a/swh/lister/opam/lister.py +++ b/swh/lister/opam/lister.py @@ -1,144 +1,150 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import logging import os from subprocess import PIPE, Popen, call from typing import Any, Dict, Iterator, Optional from swh.lister.pattern import StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType logger = logging.getLogger(__name__) PageType = str class OpamLister(StatelessLister[PageType]): """ List all repositories hosted on an opam repository. On initialisation, we create an opam root, with no ocaml compiler (no switch) as we won't need it and it's costly. In this opam root, we add a single opam repository (url) and give it a name (instance). Then, to get pages, we just ask opam to list all the packages for our opam repository in our opam root. 
Args: url: base URL of an opam repository (for instance https://opam.ocaml.org) instance: string identifier for the listed repository """ # Part of the lister API, that identifies this lister LISTER_NAME = "opam" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, opam_root: str = "/tmp/opam/", ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.env = os.environ.copy() # Opam root folder is initialized in the :meth:`get_pages` method as no # side-effect should happen in the constructor to ease instantiation self.opam_root = opam_root def get_pages(self) -> Iterator[PageType]: # Initialize the opam root directory opam_init(self.opam_root, self.instance, self.url, self.env) # Actually list opam instance data proc = Popen( [ "opam", "list", "--all", "--no-switch", "--safe", "--repos", self.instance, "--root", self.opam_root, "--normalise", "--short", ], env=self.env, stdout=PIPE, ) if proc.stdout is not None: for line in io.TextIOWrapper(proc.stdout): yield line.rstrip("\n") def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: """Convert a page of OpamLister repositories into a list of ListedOrigins""" assert self.lister_obj.id is not None # a page is just a package name url = f"opam+{self.url}/packages/{page}/" yield ListedOrigin( lister_id=self.lister_obj.id, visit_type="opam", url=url, last_update=None, extra_loader_arguments={ "opam_root": self.opam_root, "opam_instance": self.instance, "opam_url": self.url, "opam_package": page, }, ) def opam_init(opam_root: str, instance: str, url: str, env: Dict[str, Any]) -> None: """Initialize an opam_root folder. Args: opam_root: The opam root folder to initialize instance: Name of the opam repository to add or initialize url: The associated url of the opam repository to add or initialize env: The global environment to use for the opam command. Returns: None. """ if not os.path.exists(opam_root) or not os.listdir(opam_root): command = [ "opam", "init", "--reinit", "--bare", "--no-setup", "--root", opam_root, instance, url, ] else: # The repository exists and is populated, we just add another instance in the # repository. 
If it's already setup, it's a noop command = [ "opam", "repository", "add", "--root", opam_root, instance, url, ] # Actually execute the command call(command, env=env) diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index e9fa296..af57b55 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -1,176 +1,182 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime, timezone import logging from typing import Any, Dict, Iterator, List, Optional import iso8601 import requests from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) PackagistPageType = List[str] @dataclass class PackagistListerState: """State of Packagist lister""" last_listing_date: Optional[datetime] = None """Last date when packagist lister was executed""" class PackagistLister(Lister[PackagistListerState, PackagistPageType]): """ List all Packagist projects and send associated origins to scheduler. The lister queries the Packagist API, whose documentation can be found at https://packagist.org/apidoc. For each package, its metadata are retrieved using Packagist API endpoints whose responses are served from static files, which are guaranteed to be efficient on the Packagist side (no dymamic queries). Furthermore, subsequent listing will send the "If-Modified-Since" HTTP header to only retrieve packages metadata updated since the previous listing operation in order to save bandwidth and return only origins which might have new released versions. """ LISTER_NAME = "Packagist" PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json" PACKAGIST_REPO_BASE_URL = "https://repo.packagist.org/p" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.PACKAGIST_PACKAGES_LIST_URL, instance="packagist", credentials=credentials, with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) self.listing_date = datetime.now().astimezone(tz=timezone.utc) def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState: last_listing_date = d.get("last_listing_date") if last_listing_date is not None: d["last_listing_date"] = iso8601.parse_date(last_listing_date) return PackagistListerState(**d) def state_to_dict(self, state: PackagistListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {"last_listing_date": None} last_listing_date = state.last_listing_date if last_listing_date is not None: d["last_listing_date"] = last_listing_date.isoformat() return d def api_request(self, url: str) -> Any: response = self.http_request(url) # response is empty when status code is 304 return response.json() if response.status_code == 200 else {} def get_pages(self) -> Iterator[PackagistPageType]: """ Yield a single page listing all Packagist projects. 
""" yield self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)["packageNames"] def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]: """ Iterate on all Packagist projects and yield ListedOrigin instances. """ assert self.lister_obj.id is not None # save some bandwidth by only getting packages metadata updated since # last listing if self.state.last_listing_date is not None: if_modified_since = self.state.last_listing_date.strftime( "%a, %d %b %Y %H:%M:%S GMT" ) self.session.headers["If-Modified-Since"] = if_modified_since # to ensure origins will not be listed multiple times origin_urls = set() for package_name in page: try: metadata = self.api_request( f"{self.PACKAGIST_REPO_BASE_URL}/{package_name}.json" ) if not metadata.get("packages", {}): # package metadata not updated since last listing continue if package_name not in metadata["packages"]: # missing package metadata in response continue versions_info = metadata["packages"][package_name].values() except requests.HTTPError: # error when getting package metadata (usually 404 when a # package has been removed), skip it and process next package continue origin_url = None visit_type = None last_update = None # extract origin url for package, vcs type and latest release date for version_info in versions_info: origin_url = version_info.get("source", {}).get("url", "") if not origin_url: continue # can be git, hg or svn visit_type = version_info.get("source", {}).get("type", "") dist_time_str = version_info.get("time", "") if not dist_time_str: continue dist_time = iso8601.parse_date(dist_time_str) if last_update is None or dist_time > last_update: last_update = dist_time # skip package with already seen origin url or with missing required info if visit_type is None or origin_url is None or origin_url in origin_urls: continue if visit_type == "git": # Non-github urls will be returned as is, github ones will be canonical # ones assert self.github_session is not None origin_url = ( self.github_session.get_canonical_url(origin_url) or origin_url ) # bitbucket closed its mercurial hosting service, those origins can not be # loaded into the archive anymore if visit_type == "hg" and origin_url.startswith("https://bitbucket.org/"): continue origin_urls.add(origin_url) logger.debug( "Found package %s last updated on %s", package_name, last_update ) yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=visit_type, last_update=last_update, ) def finalize(self) -> None: self.state.last_listing_date = self.listing_date self.updated = True diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py index 4556178..651dc8e 100644 --- a/swh/lister/phabricator/lister.py +++ b/swh/lister/phabricator/lister.py @@ -1,165 +1,174 @@ # Copyright (C) 2019-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import logging import random from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) PageType = List[Dict[str, Any]] class PhabricatorLister(StatelessLister[PageType]): """ List all repositories hosted on a Phabricator instance. 
Args: url: base URL of a phabricator forge (for instance https://forge.softwareheritage.org) instance: string identifier for the listed forge, URL network location will be used if not provided api_token: authentication token for Conduit API """ LISTER_NAME = "phabricator" API_REPOSITORY_PATH = "/api/diffusion.repository.search" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, api_token: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( - scheduler, urljoin(url, self.API_REPOSITORY_PATH), instance, credentials + scheduler=scheduler, + url=urljoin(url, self.API_REPOSITORY_PATH), + instance=instance, + credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) if api_token is not None: self.api_token = api_token else: if not self.credentials: raise ValueError( f"No credentials found for phabricator instance {self.instance};" " Please set them in the lister configuration file." ) self.api_token = random.choice(self.credentials)["password"] def get_request_params(self, after: Optional[str]) -> Dict[str, str]: """Get the query parameters for the request.""" base_params = { # Stable order "order": "oldest", # Add all URIs to the response "attachments[uris]": "1", # API token from stored credentials "api.token": self.api_token, } if after is not None: base_params["after"] = after return base_params @staticmethod def filter_params(params: Dict[str, str]) -> Dict[str, str]: """Filter the parameters for debug purposes""" return { k: (v if k != "api.token" else "**redacted**") for k, v in params.items() } def get_pages(self) -> Iterator[PageType]: after: Optional[str] = None while True: params = self.get_request_params(after) response = self.http_request(self.url, method="POST", data=params) response_data = response.json() if response_data.get("result") is None: logger.warning( "Got unexpected response on %s: %s", response.url, response_data, ) break result = response_data["result"] yield result["data"] after = None if "cursor" in result and "after" in result["cursor"]: after = result["cursor"]["after"] if not after: logger.debug("Empty `after` cursor. All done") break def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for repo in page: url = get_repo_url(repo["attachments"]["uris"]["uris"]) if url is None: short_name: Optional[str] = None for field in "shortName", "name", "callsign": short_name = repo["fields"].get(field) if short_name: break logger.warning( "No valid url for repository [%s] (phid=%s)", short_name or repo["phid"], repo["phid"], ) continue yield ListedOrigin( lister_id=self.lister_obj.id, url=url, visit_type=repo["fields"]["vcs"], # The "dateUpdated" field returned by the Phabricator API only refers to # the repository metadata; We can't use it for our purposes. 
last_update=None, ) def get_repo_url(attachments: List[Dict[str, Any]]) -> Optional[str]: """ Return url for a hosted repository from its uris attachments according to the following priority lists: * protocol: https > http * identifier: shortname > callsign > id """ processed_urls = defaultdict(dict) # type: Dict[str, Any] for uri in attachments: protocol = uri["fields"]["builtin"]["protocol"] url = uri["fields"]["uri"]["effective"] identifier = uri["fields"]["builtin"]["identifier"] if protocol in ("http", "https"): processed_urls[protocol][identifier] = url elif protocol is None: for protocol in ("https", "http"): if url.startswith(protocol): processed_urls[protocol]["undefined"] = url break for protocol in ["https", "http"]: for identifier in ["shortname", "callsign", "id", "undefined"]: if protocol in processed_urls and identifier in processed_urls[protocol]: return processed_urls[protocol][identifier] return None diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index fd1dc45..50e4f15 100644 --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -1,94 +1,100 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Iterator, List, Optional import iso8601 from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. PubDevListerPage = List[str] class PubDevLister(StatelessLister[PubDevListerPage]): """List pub.dev (Dart, Flutter) origins.""" LISTER_NAME = "pubdev" VISIT_TYPE = "pubdev" INSTANCE = "pubdev" BASE_URL = "https://pub.dev/" PACKAGE_NAMES_URL_PATTERN = "{base_url}api/package-names" PACKAGE_INFO_URL_PATTERN = "{base_url}api/packages/{pkgname}" ORIGIN_URL_PATTERN = "{base_url}packages/{pkgname}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) def get_pages(self) -> Iterator[PubDevListerPage]: """Yield an iterator which returns 'page' It uses the api provided by https://pub.dev/api/ to find Dart and Flutter package origins. The http api call get "{base_url}package-names" to retrieve a sorted list of all package names. 
There is only one page that list all origins url based on "{base_url}packages/{pkgname}" """ response = self.http_request( url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url) ) yield response.json()["packages"] def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for pkgname in page: package_info_url = self.PACKAGE_INFO_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) try: response = self.http_request(url=package_info_url) except HTTPError: logger.warning( "Failed to fetch metadata for package %s, skipping it from listing.", pkgname, ) continue package_metadata = response.json() package_versions = package_metadata["versions"] last_published = max( package_version["published"] for package_version in package_versions ) origin_url = self.ORIGIN_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin_url, last_update=iso8601.parse_date(last_published), ) diff --git a/swh/lister/puppet/lister.py b/swh/lister/puppet/lister.py index 39deecf..6e84b27 100644 --- a/swh/lister/puppet/lister.py +++ b/swh/lister/puppet/lister.py @@ -1,155 +1,161 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime, timedelta, timezone import logging from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. 
PuppetListerPage = List[Dict[str, Any]] @dataclass class PuppetListerState: """Store lister state for incremental mode operations""" last_listing_date: Optional[datetime] = None """Last date when Puppet lister was executed""" class PuppetLister(Lister[PuppetListerState, PuppetListerPage]): """The Puppet lister list origins from 'Puppet Forge'""" LISTER_NAME = "puppet" VISIT_TYPE = "puppet" INSTANCE = "puppet" BASE_URL = "https://forgeapi.puppet.com/" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Store the datetime the lister runs for incremental purpose self.listing_date = datetime.now() def state_from_dict(self, d: Dict[str, Any]) -> PuppetListerState: last_listing_date = d.get("last_listing_date") if last_listing_date is not None: d["last_listing_date"] = iso8601.parse_date(last_listing_date) return PuppetListerState(**d) def state_to_dict(self, state: PuppetListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {"last_listing_date": None} last_listing_date = state.last_listing_date if last_listing_date is not None: d["last_listing_date"] = last_listing_date.isoformat() return d def get_pages(self) -> Iterator[PuppetListerPage]: """Yield an iterator which returns 'page' It request the http api endpoint to get a paginated results of modules, and retrieve a `next` url. It ends when `next` json value is `null`. Open Api specification for getModules endpoint: https://forgeapi.puppet.com/#tag/Module-Operations/operation/getModules """ # limit = 100 is the max value for pagination limit: int = 100 params: Dict[str, Any] = {"limit": limit} if self.state.last_listing_date: # Incremental mode filter query # To ensure we don't miss records between two lister runs `last_str`` must be # set with an offset of -15 hours, which is the lower timezone recorded in the # tzdb last_str = ( self.state.last_listing_date.astimezone(timezone(timedelta(hours=-15))) .date() .isoformat() ) params["with_release_since"] = last_str response = self.http_request(f"{self.BASE_URL}v3/modules", params=params) data: Dict[str, Any] = response.json() yield data["results"] while data["pagination"]["next"]: response = self.http_request( urljoin(self.BASE_URL, data["pagination"]["next"]) ) data = response.json() yield data["results"] def get_origins_from_page(self, page: PuppetListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None dt_parse_pattern = "%Y-%m-%d %H:%M:%S %z" for entry in page: last_update = datetime.strptime(entry["updated_at"], dt_parse_pattern) pkgname = entry["name"] owner = entry["owner"]["slug"] url = f"https://forge.puppet.com/modules/{owner}/{pkgname}" artifacts = [] for release in entry["releases"]: # Build an artifact entry following original-artifacts-json specification # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 checksums = {} if release["version"] == entry["current_release"]["version"]: # checksums are only available for current release for checksum in ("md5", "sha256"): checksums[checksum] = entry["current_release"][ f"file_{checksum}" ] else: # use file 
length as basic content check instead checksums["length"] = release["file_size"] artifacts.append( { "filename": release["file_uri"].split("/")[-1], "url": urljoin(self.BASE_URL, release["file_uri"]), "version": release["version"], "last_update": datetime.strptime( release["created_at"], dt_parse_pattern ).isoformat(), "checksums": checksums, } ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=last_update, extra_loader_arguments={"artifacts": artifacts}, ) def finalize(self) -> None: self.state.last_listing_date = self.listing_date self.updated = True diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index 443c21d..64f14fa 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -1,177 +1,183 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging from time import sleep from typing import Any, Dict, Iterator, List, Optional, Tuple from xmlrpc.client import Fault, ServerProxy from tenacity.before_sleep import before_sleep_log from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Type returned by the XML-RPC changelog call: # package, version, release timestamp, description, serial ChangelogEntry = Tuple[str, str, int, str, int] # Manipulated package updated type which is a subset information # of the ChangelogEntry type: package, max release date PackageUpdate = Tuple[str, datetime] # Type returned by listing a page of results PackageListPage = List[PackageUpdate] @dataclass class PyPIListerState: """State of PyPI lister""" last_serial: Optional[int] = None """Last seen serial when visiting the pypi instance""" def _if_rate_limited(retry_state) -> bool: """Custom tenacity retry predicate to handle xmlrpc client error: .. 
code:: xmlrpc.client.Fault: """ attempt = retry_state.outcome return attempt.failed and isinstance(attempt.exception(), Fault) def pypi_url(package_name: str) -> str: """Build pypi url out of a package name.""" return PyPILister.PACKAGE_URL.format(package_name=package_name) class PyPILister(Lister[PyPIListerState, PackageListPage]): """List origins from PyPI.""" LISTER_NAME = "pypi" INSTANCE = "pypi" # As of today only the main pypi.org is used PACKAGE_LIST_URL = "https://pypi.org/pypi" # XML-RPC url PACKAGE_URL = "https://pypi.org/project/{package_name}/" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.PACKAGE_LIST_URL, instance=self.INSTANCE, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # used as termination condition and if useful, becomes the new state when the # visit is done self.last_processed_serial: Optional[int] = None def state_from_dict(self, d: Dict[str, Any]) -> PyPIListerState: return PyPIListerState(last_serial=d.get("last_serial")) def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]: return asdict(state) @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_last_serial(self, client: ServerProxy) -> int: """Internal detail to allow throttling when calling the changelog last entry""" serial = client.changelog_last_serial() assert isinstance(serial, int) return serial @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_since_serial( self, client: ServerProxy, serial: int ) -> List[ChangelogEntry]: """Internal detail to allow throttling when calling the changelog listing""" sleep(1) # to avoid the initial warning about throttling return client.changelog_since_serial(serial) # type: ignore def get_pages(self) -> Iterator[PackageListPage]: """Iterate other changelog events per package, determine the max release date for that package and use that max release date as last_update. When the execution is done, this will also set the self.last_processed_serial attribute so we can finalize the state of the lister for the next visit. 
Yields: List of Tuple of (package-name, max release-date) """ client = ServerProxy(self.url) last_processed_serial = -1 if self.state.last_serial is not None: last_processed_serial = self.state.last_serial upstream_last_serial = self._changelog_last_serial(client) # Paginate through result of pypi, until we read everything while last_processed_serial < upstream_last_serial: updated_packages = defaultdict(list) for package, _, release_date, _, serial in self._changelog_since_serial( client, last_processed_serial ): updated_packages[package].append(release_date) # Compute the max serial so we can stop when done last_processed_serial = max(last_processed_serial, serial) # Returns pages of result to flush regularly yield [ ( pypi_url(package), datetime.fromtimestamp(max(release_dates)).replace( tzinfo=timezone.utc ), ) for package, release_dates in updated_packages.items() ] self.last_processed_serial = upstream_last_serial def get_origins_from_page( self, packages: PackageListPage ) -> Iterator[ListedOrigin]: """Convert a page of PyPI repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None for origin, last_update in packages: yield ListedOrigin( lister_id=self.lister_obj.id, url=origin, visit_type="pypi", last_update=last_update, ) def finalize(self): """Finalize the visit state by updating with the new last_serial if updates actually happened. """ self.updated = ( self.state and self.state.last_serial and self.last_processed_serial and self.state.last_serial < self.last_processed_serial ) or (not self.state.last_serial and self.last_processed_serial) if self.updated: self.state.last_serial = self.last_processed_serial diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py index 917a2d6..bb317ea 100644 --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -1,230 +1,236 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from datetime import timezone import gzip import logging import os import shutil import subprocess import tarfile import tempfile from typing import Any, Dict, Iterator, Optional, Tuple from bs4 import BeautifulSoup import psycopg2 from testing.postgresql import Postgresql from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) RubyGemsListerPage = Dict[str, Any] class RubyGemsLister(StatelessLister[RubyGemsListerPage]): """Lister for RubyGems.org, the Ruby community's gem hosting service. Instead of querying rubygems.org Web API, it uses gems data from the daily PostreSQL database dump of rubygems. It enables to gather all interesting info about a gem and its release artifacts (version number, download URL, checksums, release date) in an efficient way and without flooding rubygems Web API with numerous HTTP requests (as there is more than 187000 gems available on 07/10/2022). 
""" LISTER_NAME = "rubygems" VISIT_TYPE = "rubygems" INSTANCE = "rubygems" RUBY_GEMS_POSTGRES_DUMP_BASE_URL = ( "https://s3-us-west-2.amazonaws.com/rubygems-dumps" ) RUBY_GEMS_POSTGRES_DUMP_LIST_URL = ( f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql" ) RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem" RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}" RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = ( "https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json" ) DB_NAME = "rubygems" DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def get_latest_dump_file(self) -> str: response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL) xml = BeautifulSoup(response.content, "xml") contents = xml.find_all("Contents") return contents[-1].find("Key").text def create_rubygems_db( self, postgresql: Postgresql ) -> Tuple[str, psycopg2._psycopg.connection]: logger.debug("Creating rubygems database") db_dsn = postgresql.dsn() db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME) db = psycopg2.connect(**db_dsn) db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) with db.cursor() as cursor: cursor.execute(f"CREATE DATABASE {self.DB_NAME}") db_dsn["database"] = self.DB_NAME db = psycopg2.connect(**db_dsn) db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) with db.cursor() as cursor: cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore") return db_url, db def populate_rubygems_db(self, db_url: str): dump_file = self.get_latest_dump_file() dump_id = dump_file.split("/")[2] response = self.http_request(f"{self.url}/{dump_file}", stream=True) with tempfile.TemporaryDirectory() as temp_dir: logger.debug( "Downloading latest rubygems database dump: %s (%s bytes)", dump_id, response.headers["content-length"], ) dump_file = os.path.join(temp_dir, "rubygems_dump.tar") with open(dump_file, "wb") as dump: for chunk in response.iter_content(chunk_size=1024): dump.write(chunk) with tarfile.open(dump_file) as dump_tar: dump_tar.extractall(temp_dir) logger.debug("Populating rubygems database with dump %s", dump_id) # FIXME: make this work with -v ON_ERROR_STOP=1 psql = subprocess.Popen( ["psql", "--no-psqlrc", "-q", db_url], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) # passing value of gzip.open as stdin of subprocess.run makes the process # read raw data instead of decompressed data so we have to use a pipe with gzip.open(os.path.join(temp_dir, self.DUMP_SQL_PATH), "rb") as sql: shutil.copyfileobj(sql, psql.stdin) # type: ignore # denote end of read file psql.stdin.close() # type: ignore psql.wait() if psql.returncode != 0: assert psql.stdout for line in psql.stdout.readlines(): logger.warning("psql out: %s", line.decode().strip()) assert psql.stderr for line in psql.stderr.readlines(): logger.warning("psql err: %s", line.decode().strip()) raise ValueError( "Loading rubygems dump failed with exit code %s.", psql.returncode, ) def get_pages(self) -> Iterator[RubyGemsListerPage]: # spawn a temporary 
postgres instance (require initdb executable in environment) with Postgresql() as postgresql: db_url, db = self.create_rubygems_db(postgresql) self.populate_rubygems_db(db_url) with db.cursor() as cursor: cursor.execute("SELECT id, name from rubygems") for gem_id, gem_name in cursor.fetchall(): logger.debug("Processing gem named %s", gem_name) with db.cursor() as cursor_v: cursor_v.execute( "SELECT authors, built_at, number, sha256, size from versions " "where rubygem_id = %s", (gem_id,), ) versions = [ { "number": number, "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format( gem=gem_name, version=number ), "date": built_at.replace(tzinfo=timezone.utc), "authors": authors, "sha256": ( base64.decodebytes(sha256.encode()).hex() if sha256 else None ), "size": size, } for authors, built_at, number, sha256, size in cursor_v.fetchall() ] if versions: yield { "name": gem_name, "versions": versions, } def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None artifacts = [] rubygem_metadata = [] for version in page["versions"]: artifacts.append( { "version": version["number"], "filename": version["url"].split("/")[-1], "url": version["url"], "checksums": ( {"sha256": version["sha256"]} if version["sha256"] else {} ), "length": version["size"], } ) rubygem_metadata.append( { "version": version["number"], "date": version["date"].isoformat(), "authors": version["authors"], "extrinsic_metadata_url": ( self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format( gem=page["name"], version=version["number"] ) ), } ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]), last_update=max(version["date"] for version in page["versions"]), extra_loader_arguments={ "artifacts": artifacts, "rubygem_metadata": rubygem_metadata, }, ) diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index ba8c412..234e198 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,430 +1,436 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass, field import datetime from enum import Enum import logging import re from typing import Any, Dict, Iterator, List, Optional, Set, Tuple from xml.etree import ElementTree from bs4 import BeautifulSoup import iso8601 import lxml import requests from swh.core.api.classes import stream_results from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) class VcsNames(Enum): """Used to filter SourceForge tool names for valid VCS types""" # CVS projects are read-only CVS = "cvs" GIT = "git" SUBVERSION = "svn" MERCURIAL = "hg" BAZAAR = "bzr" VCS_NAMES = set(v.value for v in VcsNames.__members__.values()) @dataclass class SourceForgeListerEntry: vcs: VcsNames url: str last_modified: datetime.date SubSitemapNameT = str ProjectNameT = str # SourceForge only offers day-level granularity, which is good enough for our purposes LastModifiedT = datetime.date @dataclass class SourceForgeListerState: """Current state of the SourceForge lister in incremental runs""" """If the subsitemap does not exist, we assume a full run of this subsitemap is needed. 
If the date is the same, we skip the subsitemap, otherwise we request the subsitemap and look up every project's "last modified" date to compare against `ListedOrigins` from the database.""" subsitemap_last_modified: Dict[SubSitemapNameT, LastModifiedT] = field( default_factory=dict ) """Some projects (not the majority, but still meaningful) have no VCS for us to archive. We need to remember a mapping of their API URL to their "last modified" date so we don't keep querying them needlessly every time.""" empty_projects: Dict[str, LastModifiedT] = field(default_factory=dict) SourceForgeListerPage = List[SourceForgeListerEntry] MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" # API resource endpoint for information about the given project. # # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe` # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g. `backapps/website`. PROJECT_API_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}" # Predictable URL for cloning (in the broad sense) a VCS registered for the project. # # Warning: does not apply to bzr repos, and Mercurial repos are http only, see use of this # constant below. # # `vcs`: VCS type, one of `VCS_NAMES` # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe`. # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g. `backapps/website`. # `mount_point`: URL path used by the repo. For example, the Code::Blocks project uses # `git` (https://git.code.sf.net/p/codeblocks/git). CLONE_URL_FORMAT = "https://{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" PROJ_URL_RE = re.compile( r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?" ) # Mapping of `(namespace, project name)` to `last modified` date. ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModifiedT] class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): """List origins from the "SourceForge" forge.""" # Part of the lister API; identifies this lister LISTER_NAME = "sourceforge" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url="https://sourceforge.net", instance="main", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Will hold the currently saved "last modified" dates to compare against our # requests.
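# The cache maps (namespace, project) pairs to plain ``datetime.date`` values, e.g. # (purely illustrative entry): {("p", "codeblocks"): date(2022, 1, 1)}.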
self._project_last_modified: Optional[ProjectsLastModifiedCache] = None self.session.headers.update({"Accept": "application/json"}) self.incremental = incremental def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState: subsitemaps = { k: datetime.date.fromisoformat(v) for k, v in d.get("subsitemap_last_modified", {}).items() } empty_projects = { k: datetime.date.fromisoformat(v) for k, v in d.get("empty_projects", {}).items() } return SourceForgeListerState( subsitemap_last_modified=subsitemaps, empty_projects=empty_projects ) def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]: return { "subsitemap_last_modified": { k: v.isoformat() for k, v in state.subsitemap_last_modified.items() }, "empty_projects": { k: v.isoformat() for k, v in state.empty_projects.items() }, } def projects_last_modified(self) -> ProjectsLastModifiedCache: if not self.incremental: # No point in loading the previous results if we're doing a full run return {} if self._project_last_modified is not None: return self._project_last_modified # We know there will be at least that many origins stream = stream_results( self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000 ) listed_origins = dict() # Projects can have slashes in them if they're subprojects, but the # mount point (last component) cannot. url_match = re.compile( r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*" ) bzr_url_match = re.compile( r"http://(?P<project>[^/]+).bzr.sourceforge.net/bzr/([^/]+)" ) cvs_url_match = re.compile( r"rsync://a.cvs.sourceforge.net/cvsroot/(?P<project>.+)/([^/]+)" ) for origin in stream: url = origin.url match = url_match.match(url) if match is None: # Could be a bzr or cvs special endpoint bzr_match = bzr_url_match.match(url) cvs_match = cvs_url_match.match(url) matches = None if bzr_match is not None: matches = bzr_match.groupdict() elif cvs_match is not None: matches = cvs_match.groupdict() assert matches project = matches["project"] namespace = "p" # no special namespacing for bzr and cvs projects else: matches = match.groupdict() namespace = matches["namespace"] project = matches["project"] # "Last modified" dates are the same across all VCS (tools, even) # within a project or subproject. An assertion here would be overkill. last_modified = origin.last_update assert last_modified is not None listed_origins[(namespace, project)] = last_modified.date() self._project_last_modified = listed_origins return listed_origins def get_pages(self) -> Iterator[SourceForgeListerPage]: """ SourceForge has a main XML sitemap that lists its sharded sitemaps for all projects. Each XML sub-sitemap lists project pages, which are not unique per project: a project can have a wiki, a home, a git, an svn, etc. For each unique project, we query an API endpoint that lists (among other things) the tools associated with said project, some of which are the VCS used. Subprojects are considered separate projects. Lastly, we use the information about which VCS are used to build the predictable clone URL for any given VCS.
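For instance, a project in the ``p`` namespace exposing a ``git`` tool mounted at ``git`` gets a clone URL built from ``CLONE_URL_FORMAT``, e.g. https://git.code.sf.net/p/codeblocks/git (the Code::Blocks example from the comments above); Mercurial and Bazaar repositories are special-cased further below.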
""" sitemap_contents = self.http_request(MAIN_SITEMAP_URL).text tree = ElementTree.fromstring(sitemap_contents) for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_el is not None and last_modified_el.text is not None last_modified = datetime.date.fromisoformat(last_modified_el.text) location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None and location.text is not None sub_url = location.text if self.incremental: recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url) if recorded_last_mod == last_modified: # The entire subsitemap hasn't changed, so none of its projects # have either, skip it. continue self.state.subsitemap_last_modified[sub_url] = last_modified subsitemap_contents = self.http_request(sub_url).text subtree = ElementTree.fromstring(subsitemap_contents) yield from self._get_pages_from_subsitemap(subtree) def get_origins_from_page( self, page: SourceForgeListerPage ) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for hit in page: last_modified: str = str(hit.last_modified) last_update: datetime.datetime = iso8601.parse_date(last_modified) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=hit.vcs.value, url=hit.url, last_update=last_update, ) def _get_pages_from_subsitemap( self, subtree: ElementTree.Element ) -> Iterator[SourceForgeListerPage]: projects: Set[ProjectNameT] = set() for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_block is not None last_modified = last_modified_block.text location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None project_url = location.text assert project_url is not None match = PROJ_URL_RE.match(project_url) if match: matches = match.groupdict() namespace = matches["namespace"] if namespace == "projects": # These have a `p`-namespaced counterpart, use that instead continue project = matches["project"] rest = matches["rest"] if rest.count("/") > 1: # This is a subproject. There exists no sub-subprojects. subproject_name = rest.rsplit("/", 2)[0] project = f"{project}/{subproject_name}" prev_len = len(projects) projects.add(project) if prev_len == len(projects): # Already seen continue pages = self._get_pages_for_project(namespace, project, last_modified) if pages: yield pages else: logger.debug("Project '%s' does not have any VCS", project) else: # Should almost always match, let's log it # The only ones that don't match are mostly specialized one-off URLs. msg = "Project URL '%s' does not match expected pattern" logger.warning(msg, project_url) def _get_pages_for_project( self, namespace, project, last_modified ) -> SourceForgeListerPage: endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project) empty_project_last_modified = self.state.empty_projects.get(endpoint) if empty_project_last_modified is not None: if last_modified == empty_project_last_modified.isoformat(): # Project has not changed, so is still empty, meaning it has # no VCS attached that we can archive. 
logger.debug(f"Project {namespace}/{project} is still empty") return [] if self.incremental: expected = self.projects_last_modified().get((namespace, project)) if expected is not None: if expected.isoformat() == last_modified: # Project has not changed logger.debug(f"Project {namespace}/{project} has not changed") return [] else: logger.debug(f"Project {namespace}/{project} was updated") else: msg = "New project during an incremental run: %s/%s" logger.debug(msg, namespace, project) try: res = self.http_request(endpoint).json() except requests.HTTPError: # We've already logged in `http_request` return [] tools = res.get("tools") if tools is None: # This rarely happens, on very old URLs logger.warning("Project '%s' does not have any tools", endpoint) return [] hits = [] for tool in tools: tool_name = tool["name"] if tool_name not in VCS_NAMES: continue if tool_name == VcsNames.CVS.value: # CVS projects are different from other VCS ones, they use the rsync # protocol, a list of modules needs to be fetched from an info page # and multiple origin URLs can be produced for a same project. cvs_info_url = f"http://{project}.cvs.sourceforge.net" try: response = self.http_request(cvs_info_url) except requests.HTTPError: logger.warning( "CVS info page could not be fetched, skipping project '%s'", project, ) continue else: bs = BeautifulSoup(response.text, features="html.parser") cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot" for text in [b.text for b in bs.find_all("b")]: match = re.search(rf".*/cvsroot/{project} co -P (.+)", text) if match is not None: module = match.group(1) if module != "Attic": url = f"{cvs_base_url}/{project}/{module}" hits.append( SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified, ) ) continue url = CLONE_URL_FORMAT.format( vcs=tool_name, namespace=namespace, project=project, mount_point=tool["mount_point"], ) if tool_name == VcsNames.MERCURIAL.value: # SourceForge does not yet support anonymous HTTPS cloning for Mercurial # See https://sourceforge.net/p/forge/feature-requests/727/ url = url.replace("https://", "http://") if tool_name == VcsNames.BAZAAR.value: # SourceForge has removed support for bzr and only keeps legacy projects # around at a separate (also not https) URL. Bzr projects are very rare # and a lot of them are 404 now. 
url = f"http://{project}.bzr.sourceforge.net/bzr/{project}" try: response = self.http_request(url) if "To get this branch, use:" not in response.text: # If a bzr project has multiple branches, we need to extract their # names from the repository landing page and create one listed origin # per branch parser = lxml.etree.HTMLParser() tree = lxml.etree.fromstring(response.text, parser) # Get all tds with class 'autcell' tds = tree.xpath(".//td[contains(@class, 'autcell')]") for td in tds: branch = td.findtext("a") # If the td's parent contains Branch and # it has non-empty text: if td.xpath("..//img[@alt='Branch']") and branch: hits.append( SourceForgeListerEntry( vcs=VcsNames(tool_name), url=f"{url}/{branch}", last_modified=last_modified, ) ) continue except requests.HTTPError: logger.warning( "Bazaar repository page could not be fetched, skipping project '%s'", project, ) continue entry = SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified ) hits.append(entry) if not hits: date = datetime.date.fromisoformat(last_modified) self.state.empty_projects[endpoint] = date else: self.state.empty_projects.pop(endpoint, None) return hits diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py index 4a55499..ce5cadf 100644 --- a/swh/lister/tuleap/lister.py +++ b/swh/lister/tuleap/lister.py @@ -1,123 +1,129 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] class TuleapLister(StatelessLister[RepoPage]): """List origins from Tuleap. Tuleap provides SVN and Git repositories hosting. Tuleap API getting started: https://tuleap.net/doc/en/user-guide/integration/rest.html Tuleap API reference: https://tuleap.net/api/explorer/ Using the API we first request a list of projects, and from there request their associated repositories individually. 
Everything is paginated, code uses throttling at the individual GET call level.""" LISTER_NAME = "tuleap" REPO_LIST_PATH = "/api" REPO_GIT_PATH = "plugins/git/" REPO_SVN_PATH = "plugins/svn/" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) @classmethod def results_simplified(cls, url: str, repo_type: str, repo: RepoPage) -> RepoPage: if repo_type == "git": prefix_url = TuleapLister.REPO_GIT_PATH else: prefix_url = TuleapLister.REPO_SVN_PATH rep = { "project": repo["name"], "type": repo_type, "uri": urljoin(url, f"{prefix_url}{repo['path']}"), "last_update_date": repo["last_update_date"], } return rep def _get_repositories(self, url_repo) -> List[Dict[str, Any]]: ret = self.http_request(url_repo) reps_list = ret.json()["repositories"] limit = int(ret.headers["X-PAGINATION-LIMIT-MAX"]) offset = int(ret.headers["X-PAGINATION-LIMIT"]) size = int(ret.headers["X-PAGINATION-SIZE"]) while offset < size: url_offset = url_repo + "?offset=" + str(offset) + "&limit=" + str(limit) ret = self.http_request(url_offset).json() reps_list = reps_list + ret["repositories"] offset += limit return reps_list def get_pages(self) -> Iterator[RepoPage]: # base with trailing slash, path without leading slash for urljoin url_api: str = urljoin(self.url, self.REPO_LIST_PATH) url_projects = url_api + "/projects/" # Get the list of projects. response = self.http_request(url_projects) projects_list = response.json() limit = int(response.headers["X-PAGINATION-LIMIT-MAX"]) offset = int(response.headers["X-PAGINATION-LIMIT"]) size = int(response.headers["X-PAGINATION-SIZE"]) while offset < size: url_offset = ( url_projects + "?offset=" + str(offset) + "&limit=" + str(limit) ) ret = self.http_request(url_offset).json() projects_list = projects_list + ret offset += limit # Get list of repositories for each project. for p in projects_list: p_id = p["id"] # Fetch Git repositories for project url_git = url_projects + str(p_id) + "/git" repos = self._get_repositories(url_git) for repo in repos: yield self.results_simplified(url_api, "git", repo) def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: """Convert a page of Tuleap repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None yield ListedOrigin( lister_id=self.lister_obj.id, url=page["uri"], visit_type=page["type"], last_update=iso8601.parse_date(page["last_update_date"]), )