diff --git a/assets/src/bundles/webapp/coverage.css b/assets/src/bundles/webapp/coverage.css new file mode 100644 --- /dev/null +++ b/assets/src/bundles/webapp/coverage.css @@ -0,0 +1,79 @@ +/** + * Copyright (C) 2021 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU Affero General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +.swh-coverage { + padding-top: 0.3rem; + border: none; + overflow: visible; +} + +.swh-coverage a { + text-decoration: none; +} + +.swh-coverage-col { + padding-left: 10px; + padding-right: 10px; +} + +.swh-coverage-header { + padding-top: 0; + padding-bottom: 0; +} + +.swh-coverage-logo { + display: block; + width: 100%; + height: 50px; + margin-left: auto; + margin-right: auto; + object-fit: contain; + + /* polyfill for old browsers, see https://github.com/bfred-it/object-fit-images */ + font-family: "object-fit: contain;"; +} + +.swh-coverage-list { + width: 100%; + height: 320px; + border: none; +} + +.swh-coverage-chevron { + position: absolute; + right: 0; +} + +.swh-coverage .card-header .mdi { + transition: 0.3s transform ease-in-out; +} + +.swh-coverage .card-header .collapsed .mdi { + transform: rotate(90deg); +} + +.swh-coverage-info-body { + max-height: 150px; + overflow-y: auto; + overflow-x: hidden; + scrollbar-width: thin; /* Firefox only */ + padding: 0; +} + +/* Thin scrollbar for chromium based browsers */ + +.swh-coverage-info-body::-webkit-scrollbar { + width: 4px; +} + +.swh-coverage-info-body::-webkit-scrollbar-track { + background: #eff0f1; +} + +.swh-coverage-info-body::-webkit-scrollbar-thumb { + background: #909396; +} diff --git a/assets/src/bundles/webapp/index.js b/assets/src/bundles/webapp/index.js --- a/assets/src/bundles/webapp/index.js +++ b/assets/src/bundles/webapp/index.js @@ -1,5 +1,5 @@ /** - * Copyright (C) 2018-2020 The Software Heritage developers + * Copyright (C) 2018-2021 The Software Heritage developers * See the AUTHORS file at the top-level directory of this distribution * License: GNU Affero General Public License version 3, or any later version * See top-level LICENSE file for more information @@ -11,6 +11,7 @@ // global swh-web custom stylesheets import './webapp.css'; import './breadcrumbs.css'; +import './coverage.css'; export * from './webapp-utils'; diff --git a/assets/src/bundles/webapp/webapp.css b/assets/src/bundles/webapp/webapp.css --- a/assets/src/bundles/webapp/webapp.css +++ b/assets/src/bundles/webapp/webapp.css @@ -126,11 +126,11 @@ } .sitename .first-word { - font-family: 'Alegreya Sans', sans-serif; + font-family: "Alegreya Sans", sans-serif; } .sitename .second-word { - font-family: 'Alegreya', serif; + font-family: "Alegreya", serif; } .swh-counter { @@ -190,7 +190,7 @@ } .modal::before { - content: ''; + content: ""; display: inline-block; height: 100%; vertical-align: middle; @@ -221,7 +221,7 @@ a.dropdown-left::before { content: "\f035e"; - font-family: 'Material Design Icons'; + font-family: "Material Design Icons"; display: block; width: 20px; height: 20px; @@ -235,7 +235,13 @@ border-right-style: none; border-bottom-style: solid; border-bottom-width: 5px; - border-image: linear-gradient(to right, rgb(226, 0, 38) 0%, rgb(254, 205, 27) 100%) 1 1 1 1; + border-image: + linear-gradient( + to right, + rgb(226, 0, 38) 0%, + rgb(254, 205, 27) 100% + ) + 1 1 1 1; width: 100%; padding: 5px; margin-bottom: 10px; @@ -479,21 +485,26 @@ border: none; } -.swh-coverage-col { - padding-left: 10px; - padding-right: 10px; -} - .swh-coverage { - height: calc(65px + 1em); padding-top: 0.3rem; border: none; + overflow: visible; } .swh-coverage a { text-decoration: none; } +.swh-coverage-col { + padding-left: 10px; + padding-right: 10px; +} + +.swh-coverage-header { + padding-top: 0; + padding-bottom: 0; +} + .swh-coverage-logo { display: block; width: 100%; @@ -503,7 +514,7 @@ object-fit: contain; /* polyfill for old browsers, see https://github.com/bfred-it/object-fit-images */ - font-family: 'object-fit: contain;'; + font-family: "object-fit: contain;"; } .swh-coverage-list { @@ -512,6 +523,41 @@ border: none; } +.swh-coverage-chevron { + position: absolute; + right: 0; +} + +.swh-coverage .card-header .mdi { + transition: 0.3s transform ease-in-out; +} + +.swh-coverage .card-header .collapsed .mdi { + transform: rotate(90deg); +} + +.swh-coverage-info-body { + max-height: 150px; + overflow-y: auto; + overflow-x: hidden; + scrollbar-width: thin; /* Firefox only */ + padding: 0; +} + +/* Thin scrollbar for chromium based browsers */ + +.swh-coverage-info-body::-webkit-scrollbar { + width: 4px; +} + +.swh-coverage-info-body::-webkit-scrollbar-track { + background: #eff0f1; +} + +.swh-coverage-info-body::-webkit-scrollbar-thumb { + background: #909396; +} + tr.swh-tr-hover-highlight:hover td { background: #ededed; } diff --git a/static/img/logos/cgit.png b/static/img/logos/cgit.png new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ - - - - - - - - - - - - - - diff --git a/static/img/logos/elife.png b/static/img/logos/elife.png new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ - - - wm_no_bg - Created with Sketch. - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/static/img/logos/guix.png b/static/img/logos/guix.png new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - diff --git a/static/img/logos/launchpad.png b/static/img/logos/launchpad.png new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ \ No newline at end of file diff --git a/static/img/logos/sourceforge.png b/static/img/logos/sourceforge.png new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@listers implemented by Software Heritage.' + ), + "origins": [ + { + "type": "bitbucket", + "info_url": "https://bitbucket.org", + "info": "public repositories from Bitbucket", + "search_pattern": "https://bitbucket.org/", + }, + { + "type": "cgit", + "info_url": "https://git.zx2c4.com/cgit/about", + "info": "public repositories from cgit instances", + "search_pattern": "cgit", + }, + { + "type": "CRAN", + "info_url": "https://cran.r-project.org", + "info": "source packages from The Comprehensive R Archive Network", + "search_pattern": "https://cran.r-project.org/", + }, + { + "type": "debian", + "info_url": "https://www.debian.org", + "info": "source packages from the Debian distribution", + "search_pattern": "deb://", + }, + { + "type": "gitea", + "info_url": "https://gitea.io", + "info": "public repositories from Gitea instances", + "search_pattern": "gitea", + }, + { + "type": "github", + "info_url": "https://github.com", + "info": "public repositories from GitHub", + "search_pattern": "https://github.com/", + }, + { + "type": "gitlab", + "info_url": "https://gitlab.com", + "info": "public repositories from multiple GitLab instances", + "search_pattern": "gitlab", + }, + { + "type": "guix", + "info_url": "https://guix.gnu.org", + "info": "source code tarballs used to build the Guix package collection", + "visit_types": ["nixguix"], + "search_pattern": "https://guix.gnu.org/sources.json", + }, + { + "type": "GNU", + "info_url": "https://www.gnu.org", + "info": "releases from the GNU project (as of August 2015)", + "search_pattern": "gnu", + }, + { + "type": "launchpad", + "info_url": "https://launchpad.net", + "logo": "img/logos/launchpad.png", + "info": "public repositories from Launchpad", + "search_pattern": "https://git.launchpad.net/", + }, + { + "type": "nixos", + "info_url": "https://nixos.org", + "info": "source code tarballs used to build the Nix package collection", + "visit_types": ["nixguix"], + "search_pattern": ( + "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json" + ), + }, + { + "type": "npm", + "info_url": "https://www.npmjs.com", + "info": "public packages from the package registry for javascript", + "search_pattern": "https://www.npmjs.com", + }, + # apart our forge, most phabricator origins have not been archived + # while they have been listed so do not display those type of origins + # until new listing processes have been executed and origins loaded + # + # { + # "type": "phabricator", + # "info_url": "https://www.phacility.com/phabricator", + # "info": "public repositories from multiple Phabricator instances", + # "search_pattern": "phabricator", + # }, + { + "type": "pypi", + "info_url": "https://pypi.org", + "info": "source packages from the Python Package Index", + "search_pattern": "https://pypi.org", + }, + { + "type": "sourceforge", + "info_url": "https://sourceforge.net", + "info": "public repositories from SourceForge", + "search_pattern": "code.sf.net", + }, + ], +} + +legacy_origins = { + "info": ( + "Discontinued hosting services. Those origins have been archived " + "by Software Heritage." + ), + "origins": [ + { + "type": "gitorious", + "info_url": "https://en.wikipedia.org/wiki/Gitorious", + "info": ( + "public repositories from the former Gitorious code hosting service" + ), + "visit_types": ["git"], + "search_pattern": "https://gitorious.org", + "count": "122,014", + }, + { + "type": "googlecode", + "info_url": "https://code.google.com/archive", + "info": ( + "public repositories from the former Google Code project " + "hosting service" + ), + "visit_types": ["git", "hg", "svn"], + "search_pattern": "googlecode.com", + "count": "790,026", + }, + ], +} + +deposited_origins = { + "info": ( + "These origins are directly pushed into the archive by trusted partners " + f'using the deposit service of Software Heritage.' + ), + "origins": [ + { + "type": "elife", + "info_url": "https://elifesciences.org", + "info": ( + "research software source code associated to the articles " + "eLife publishes" + ), + "search_pattern": "elife.stencila.io", + "visit_types": ["deposit"], + }, + { + "type": "hal", + "info_url": "https://hal.archives-ouvertes.fr", + "info": "scientific software source code deposited in the open archive HAL", + "visit_types": ["deposit"], + "search_pattern": "hal.archives-ouvertes.fr", + }, + { + "type": "ipol", + "info_url": "https://www.ipol.im", + "info": "software artifacts associated to the articles IPOL publishes", + "visit_types": ["deposit"], + "search_pattern": "doi.org/10.5201", + }, + ], +} + + +@lru_cache() +def _get_listers_metrics() -> Dict[str, List[Tuple[str, SchedulerMetrics]]]: + """Returns scheduler metrics in the following mapping: + Dict[lister_name, List[Tuple[instance_name, SchedulerMetrics]]] + as a lister instance has one SchedulerMetrics object per visit type. + """ + listers_metrics = defaultdict(list) + try: + listers = scheduler().get_listers() + scheduler_metrics = scheduler().get_metrics() + for lister in listers: + for metrics in filter( + lambda m: m.lister_id == lister.id, scheduler_metrics + ): + listers_metrics[lister.name].append((lister.instance_name, metrics)) + except Exception as e: + sentry_sdk.capture_exception(e) + return listers_metrics + + +@lru_cache() +def _get_deposits_netloc_counts() -> Counter: + """Return deposit counts per origin url network location. + """ + + def _process_origin_url(origin_url): + parsed_url = urlparse(origin_url) + netloc = parsed_url.netloc + # special treatment for doi.org netloc as it is not specific enough + # for origins mapping + if parsed_url.netloc == "doi.org": + netloc += "/" + parsed_url.path.split("/")[1] + return netloc + + netlocs = [] + try: + deposits = get_deposits_list() + netlocs = [ + _process_origin_url(d["origin_url"]) + for d in deposits + if d["status"] == "done" + ] + except Exception as e: + sentry_sdk.capture_exception(e) + return Counter(netlocs) + + +@lru_cache() +def _get_nixguix_origins_count(origin_url: str) -> int: + """Returns number of archived tarballs for NixOS, aka the number + of branches in a dedicated origin in the archive. + """ + snapshot = archive.lookup_latest_origin_snapshot(origin_url) + if snapshot: + snapshot_sizes = archive.lookup_snapshot_sizes(snapshot["id"]) + return snapshot_sizes["revision"] + else: + return 0 + + +def _search_url(query: str, visit_type: str) -> str: + return reverse( + "browse-search", + query_params={ + "q": query, + "visit_type": visit_type, + "with_visit": "true", + "with_content": "true", + }, + ) @xframe_options_exempt def _swh_coverage(request): - count_origins = get_config()["coverage_count_origins"] + listers_metrics = _get_listers_metrics() + for origins in listed_origins["origins"]: + origins["instances"] = {} + origins_type = origins["type"] + + # special processing for nixos/guix origins as there is no + # scheduler metrics for those + if origins_type in ("nixos", "guix"): + count = _get_nixguix_origins_count(origins["search_pattern"]) + origins["count"] = count + origins["instances"][origins_type] = {"nixguix": {"count": count}} + + if origins_type not in listers_metrics: + continue + + count = sum( + [metrics.origins_known for _, metrics in listers_metrics[origins_type]] + ) + count_never_visited = sum( + [ + metrics.origins_never_visited + for _, metrics in listers_metrics[origins_type] + ] + ) + # CRAN origins are currently marked as not visited while they have been + if origins_type != "CRAN": + count -= count_never_visited + + origins["count"] = f"{count:,}" + origins["instances"] = defaultdict(dict) + for instance, metrics in listers_metrics[origins_type]: + # not yet in production + if metrics.visit_type in ("bzr", "cvs"): + continue + origins["instances"][instance].update( + { + metrics.visit_type: { + "count": metrics.origins_known - metrics.origins_never_visited + } + } + ) + origins["visit_types"] = list( + set(origins["instances"][instance].keys()) + | set(origins.get("visit_types", [])) + ) + + if origins_type == "CRAN": + origins["instances"]["cran"]["cran"] = {"count": origins["count"]} + + # defaultdict cannot be iterated in django template + origins["instances"] = dict(origins["instances"]) + + for origins in listed_origins["origins"]: + instances = origins["instances"] + nb_instances = len(instances) + for instance_name, visit_types in instances.items(): + for visit_type in visit_types: + if nb_instances > 1: + search_pattern = instance_name + else: + search_pattern = origins["search_pattern"] + search_url = _search_url(search_pattern, visit_type) + visit_types[visit_type]["search_url"] = search_url + + for origins in legacy_origins["origins"]: + origins["search_urls"] = {} + for visit_type in origins["visit_types"]: + origins["search_urls"][visit_type] = _search_url( + origins["search_pattern"], visit_type + ) + + deposits_counts = _get_deposits_netloc_counts() + for origins in deposited_origins["origins"]: + if origins["search_pattern"] in deposits_counts: + origins["count"] = f"{deposits_counts[origins['search_pattern']]:,}" + origins["search_urls"] = { + "deposit": _search_url(origins["search_pattern"], "deposit") + } + return render( request, "misc/coverage.html", - {"providers": _code_providers, "count_origins": count_origins}, + { + "origins": { + "Regular crawling": listed_origins, + "Discontinued hosting": legacy_origins, + "On demand archival": deposited_origins, + } + }, ) diff --git a/swh/web/templates/homepage.html b/swh/web/templates/homepage.html --- a/swh/web/templates/homepage.html +++ b/swh/web/templates/homepage.html @@ -55,10 +55,6 @@

Content

-

- A significant amount of source code has already been ingested in the Software Heritage - archive. It currently includes: -

diff --git a/swh/web/templates/misc/coverage.html b/swh/web/templates/misc/coverage.html --- a/swh/web/templates/misc/coverage.html +++ b/swh/web/templates/misc/coverage.html @@ -1,5 +1,5 @@ {% comment %} -Copyright (C) 2015-2019 The Software Heritage developers +Copyright (C) 2015-2021 The Software Heritage developers See the AUTHORS file at the top-level directory of this distribution License: GNU Affero General Public License version 3, or any later version See top-level LICENSE file for more information @@ -23,7 +23,7 @@ /* @licstart The following is the entire license notice for the JavaScript code in this page. -Copyright (C) 2015-2019 The Software Heritage developers +Copyright (C) 2015-2021 The Software Heritage developers This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as @@ -49,43 +49,103 @@
-
- {% for provider in providers %} -
-
- - - -
+

+ A significant amount of source code has already been ingested in the Software Heritage + archive. It notably includes the following software origins. +

+ {% for origins_type, origins_data in origins.items %} +
{{ origins_type }}
+

{{ origins_data.info | safe }}

+
+ {% for origins in origins_data.origins %} +
+
+ + {% with 'img/logos/'|add:origins.type.lower|add:'.png' as png_logo %} + + {% endwith %} + + +
+
+ + {% if "instances" in origins %} + + + + + + + + + + {% for instance, visit_types in origins.instances.items %} + {% for visit_type, data in visit_types.items %} + {% if data.count %} + + + + + + + {% endif %} + {% endfor %} + {% endfor %} + + {% else %} + + + + + + + + + {% for visit_type, search_url in origins.search_urls.items %} + + + + + + {% endfor %} + + {% endif %} +
instancetypecountsearch
{{ instance }}{{ visit_type }}{{ data.count }} + + + +
instancetypesearch
{{ origins.type }}{{ visit_type }} + + + +
+
+
+
-
- {% endfor %} -
+ {% endfor %} +
+ {% endfor %}
JavaScript license information - {% if count_origins %} - - {% endif %} diff --git a/swh/web/tests/misc/test_coverage.py b/swh/web/tests/misc/test_coverage.py new file mode 100644 --- /dev/null +++ b/swh/web/tests/misc/test_coverage.py @@ -0,0 +1,133 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from datetime import datetime, timezone +from itertools import chain +import os +from random import choice, randint +import uuid + +import pytest + +from django.conf import settings +from django.utils.html import escape + +from swh.scheduler.model import LastVisitStatus, ListedOrigin, OriginVisitStats +from swh.web.common.utils import reverse +from swh.web.misc.coverage import ( + _get_deposits_netloc_counts, + _get_listers_metrics, + deposited_origins, + legacy_origins, + listed_origins, +) +from swh.web.tests.django_asserts import assert_contains +from swh.web.tests.utils import check_html_get_response + + +@pytest.fixture(autouse=True) +def clear_lru_caches(): + _get_listers_metrics.cache_clear() + _get_deposits_netloc_counts.cache_clear() + + +def test_coverage_view_no_metrics(client): + """ + Check coverage view can be rendered when scheduler metrics and deposits + data are not available. + """ + url = reverse("swh-coverage") + check_html_get_response( + client, url, status_code=200, template_used="misc/coverage.html" + ) + + +def test_coverage_view_with_metrics(client, swh_scheduler, mocker): + """ + Generate some sample scheduler metrics and some sample deposits + that will be consumed by the archive coverage view, then check + the HTML page gets rendered without errors. + """ + mocker.patch( + "swh.web.misc.coverage._get_nixguix_origins_count" + ).return_value = 30095 + listers = [] + for origins in listed_origins["origins"]: + # create some instances for each lister + for instance in range(randint(1, 5)): + lister = swh_scheduler.get_or_create_lister( + origins["type"], f"instance-{instance}" + ) + listers.append(lister) + # record some sample listed origins + _origins = [] + origin_visit_stats = [] + for i in range(randint(3, 10)): + url = str(uuid.uuid4()) + visit_type = choice(["git", "hg", "svn"]) + _origins.append( + ListedOrigin( + lister_id=lister.id, + url=url, + visit_type=visit_type, + extra_loader_arguments={}, + ) + ) + # set origin visit stats to some origins + if i % 2 == 0: + now = datetime.now(tz=timezone.utc) + origin_visit_stats.append( + OriginVisitStats( + url=url, + visit_type=visit_type, + last_successful=now, + last_visit=now, + last_visit_status=LastVisitStatus.successful, + last_snapshot=os.urandom(20), + ) + ) + # send origins data to scheduler + swh_scheduler.record_listed_origins(_origins) + swh_scheduler.origin_visit_stats_upsert(origin_visit_stats) + + # compute scheduler metrics + swh_scheduler.update_metrics() + + # add some sample deposits + deposits = [] + for origins in deposited_origins["origins"]: + for _ in range(randint(2, 10)): + deposits.append( + { + "origin_url": f"https://{origins['search_pattern']}/{uuid.uuid4()}", + "status": "done", + } + ) + get_deposits_list = mocker.patch("swh.web.misc.coverage.get_deposits_list") + get_deposits_list.return_value = deposits + + # check view gets rendered without errors + url = reverse("swh-coverage") + resp = check_html_get_response( + client, url, status_code=200, template_used="misc/coverage.html" + ) + + # check logos and origins search links are present in the rendered page + for origins in chain( + listed_origins["origins"], + legacy_origins["origins"], + deposited_origins["origins"], + ): + logo_url = f'{settings.STATIC_URL}img/logos/{origins["type"].lower()}.png' + assert_contains(resp, f'src="{logo_url}"') + + if "instances" in origins: + for visit_types in origins["instances"].values(): + for data in visit_types.values(): + if data["count"]: + assert_contains(resp, f'