# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import random
import time
from copy import deepcopy
from datetime import timedelta
from typing import Dict, List, Optional, Set

from swh.indexer.fossology_license import FossologyLicenseIndexer
from swh.indexer.mimetype import MimetypeIndexer
from swh.indexer.ctags import CtagsIndexer
from swh.indexer.storage import get_indexer_storage
from swh.model.hashutil import hash_to_hex, hash_to_bytes, DEFAULT_ALGORITHMS
from swh.model.model import Content, Directory, Origin, OriginVisit, OriginVisitStatus
from swh.loader.git.from_disk import GitLoaderFromArchive
from swh.search import get_search
from swh.storage.algos.dir_iterators import dir_iterator
from swh.storage.algos.snapshot import snapshot_get_latest
from swh.storage.interface import Sha1
from swh.storage.utils import now
from swh.web import config
from swh.web.browse.utils import (
get_mimetype_and_encoding_for_content,
prepare_content_for_display,
_re_encode_content,
)
from swh.web.common import service

# Module used to initialize data that will be provided as test input

# Configuration for the git loader
_TEST_LOADER_CONFIG = {
"storage": {"cls": "memory",},
"save_data": False,
"max_content_size": 100 * 1024 * 1024,
}

# Base content indexer configuration
_TEST_INDEXER_BASE_CONFIG = {
"storage": {"cls": "memory"},
"objstorage": {"cls": "memory", "args": {},},
"indexer_storage": {"cls": "memory", "args": {},},
}


def random_sha1():
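    """Return 20 random bytes as a 40-character hexadecimal string."""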
return hash_to_hex(bytes(random.randint(0, 255) for _ in range(20)))


def random_sha256():
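    """Return 32 random bytes as a 64-character hexadecimal string."""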
return hash_to_hex(bytes(random.randint(0, 255) for _ in range(32)))


def random_blake2s256():
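    """Return 32 random bytes as a 64-character hexadecimal string."""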
return hash_to_hex(bytes(random.randint(0, 255) for _ in range(32)))


def random_content():
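    """Return a dict of random checksums shaped like a content identifier."""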
return {
"sha1": random_sha1(),
"sha1_git": random_sha1(),
"sha256": random_sha256(),
"blake2s256": random_blake2s256(),
}


# MimetypeIndexer with custom configuration for tests
class _MimetypeIndexer(MimetypeIndexer):
def parse_config_file(self, *args, **kwargs):
return {
**_TEST_INDEXER_BASE_CONFIG,
"tools": {
"name": "file",
"version": "1:5.30-1+deb9u1",
"configuration": {"type": "library", "debian-package": "python3-magic"},
},
}


# FossologyLicenseIndexer with custom configuration for tests
class _FossologyLicenseIndexer(FossologyLicenseIndexer):
def parse_config_file(self, *args, **kwargs):
return {
**_TEST_INDEXER_BASE_CONFIG,
"workdir": "/tmp/swh/indexer.fossology.license",
"tools": {
"name": "nomos",
"version": "3.1.0rc2-31-ga2cbb8c",
"configuration": {"command_line": "nomossa <filepath>",},
},
}


# CtagsIndexer with custom configuration for tests
class _CtagsIndexer(CtagsIndexer):
def parse_config_file(self, *args, **kwargs):
return {
**_TEST_INDEXER_BASE_CONFIG,
"workdir": "/tmp/swh/indexer.ctags",
"languages": {"c": "c"},
"tools": {
"name": "universal-ctags",
"version": "~git7859817b",
"configuration": {
"command_line": """ctags --fields=+lnz --sort=no --links=no """
"""--output-format=json <filepath>"""
},
},
}


# Lightweight git repositories that will be loaded to generate
# input data for tests
_TEST_ORIGINS = [
{
"type": "git",
"url": "https://github.com/wcoder/highlightjs-line-numbers.js",
"archives": [
"highlightjs-line-numbers.js.zip",
"highlightjs-line-numbers.js_visit2.zip",
],
},
{
"type": "git",
"url": "https://github.com/memononen/libtess2",
"archives": ["libtess2.zip"],
},
{
"type": "git",
"url": "repo_with_submodules",
"archives": ["repo_with_submodules.tgz"],
},
]


_contents = {}


def _add_extra_contents(storage, contents):
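    """Add extra contents to the test storage (currently a sample PBM
    image) and record their sha1 in the given ``contents`` set."""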
pbm_image_data = b"""P1
# PBM example
24 7
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0
0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0
0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0
0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"""
    # Add a file with mimetype image/x-portable-bitmap to the archive content
pbm_content = Content.from_data(pbm_image_data)
storage.content_add([pbm_content])
contents.add(pbm_content.sha1)


# Test data initialization
def _init_tests_data():
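    """Load the test git repositories into a shared in-memory storage and
    collect the identifiers of the loaded objects to use as test input."""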
    # Will hold a reference to the in-memory storage shared by the loaders
storage = None
# Create search instance
search = get_search("memory")
search.initialize()
search.origin_update({"url": origin["url"]} for origin in _TEST_ORIGINS)
# Load git repositories from archives
for origin in _TEST_ORIGINS:
for i, archive in enumerate(origin["archives"]):
if i > 0:
# ensure visit dates will be different when simulating
# multiple visits of an origin
time.sleep(1)
origin_repo_archive = os.path.join(
os.path.dirname(__file__), "resources/repos/%s" % archive
)
loader = GitLoaderFromArchive(
origin["url"],
archive_path=origin_repo_archive,
config=_TEST_LOADER_CONFIG,
)
if storage is None:
storage = loader.storage
else:
loader.storage = storage
loader.load()
ori = storage.origin_get([origin["url"]])[0]
origin.update(ori.to_dict()) # add an 'id' key if enabled
search.origin_update([{"url": origin["url"], "has_visits": True}])
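
    # Register 250 extra origins, each with a single full visit pointing
    # to a fixed snapshot id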
for i in range(250):
url = "https://many.origins/%d" % (i + 1)
storage.origin_add([Origin(url=url)])
search.origin_update([{"url": url, "has_visits": True}])
date = now()
visit = OriginVisit(origin=url, date=date, type="tar")
visit = storage.origin_visit_add([visit])[0]
visit_status = OriginVisitStatus(
origin=url,
visit=visit.visit,
date=date + timedelta(minutes=1),
status="full",
snapshot=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
)
storage.origin_visit_status_add([visit_status])
sha1s: Set[Sha1] = set()
directories = set()
revisions = set()
releases = set()
snapshots = set()
content_path = {}
# Get all objects loaded into the test archive
for origin in _TEST_ORIGINS:
snp = snapshot_get_latest(storage, origin["url"])
snapshots.add(hash_to_hex(snp.id))
for branch_name, branch_data in snp.branches.items():
target_type = branch_data.target_type.value
if target_type == "revision":
revisions.add(branch_data.target)
elif target_type == "release":
release = storage.release_get([branch_data.target])[0]
revisions.add(release.target)
releases.add(hash_to_hex(branch_data.target))
for rev_log in storage.revision_shortlog(set(revisions)):
rev_id = rev_log[0]
revisions.add(rev_id)
for rev in storage.revision_get(revisions):
dir_id = rev["directory"]
directories.add(hash_to_hex(dir_id))
for entry in dir_iterator(storage, dir_id):
if entry["type"] == "file":
sha1s.add(entry["sha1"])
content_path[entry["sha1"]] = "/".join(
[hash_to_hex(dir_id), entry["path"].decode("utf-8")]
)
elif entry["type"] == "dir":
directories.add(hash_to_hex(entry["target"]))
_add_extra_contents(storage, sha1s)
# Get all checksums for each content
result: List[Optional[Content]] = storage.content_get(list(sha1s))
contents: List[Dict] = []
for content in result:
assert content is not None
sha1 = hash_to_hex(content.sha1)
content_metadata = {
algo: hash_to_hex(getattr(content, algo)) for algo in DEFAULT_ALGORITHMS
}
path = ""
if content.sha1 in content_path:
path = content_path[content.sha1]
cnt_data = storage.content_get_data(content.sha1)
assert cnt_data is not None
mimetype, encoding = get_mimetype_and_encoding_for_content(cnt_data)
_, _, cnt_data = _re_encode_content(mimetype, encoding, cnt_data)
content_display_data = prepare_content_for_display(cnt_data, mimetype, path)
content_metadata.update(
{
"path": path,
"mimetype": mimetype,
"encoding": encoding,
"hljs_language": content_display_data["language"],
"data": content_display_data["content_data"],
}
)
_contents[sha1] = content_metadata
contents.append(content_metadata)
# Create indexer storage instance that will be shared by indexers
idx_storage = get_indexer_storage("memory", {})
# Add the empty directory to the test archive
storage.directory_add([Directory(entries=())])
    # Return test data
return {
"search": search,
"storage": storage,
"idx_storage": idx_storage,
"origins": _TEST_ORIGINS,
"contents": contents,
"directories": list(directories),
"releases": list(releases),
"revisions": list(map(hash_to_hex, revisions)),
"snapshots": list(snapshots),
"generated_checksums": set(),
}


def _init_indexers(tests_data):
# Instantiate content indexers that will be used in tests
# and force them to use the memory storages
indexers = {}
for idx_name, idx_class in (
("mimetype_indexer", _MimetypeIndexer),
("license_indexer", _FossologyLicenseIndexer),
("ctags_indexer", _CtagsIndexer),
):
idx = idx_class()
idx.storage = tests_data["storage"]
idx.objstorage = tests_data["storage"].objstorage
idx.idx_storage = tests_data["idx_storage"]
idx.register_tools(idx.config["tools"])
indexers[idx_name] = idx
return indexers


def get_content(content_sha1):
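    """Return the metadata computed by _init_tests_data for the content
    whose hexadecimal sha1 is ``content_sha1``, or None if it is unknown."""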
return _contents.get(content_sha1)


_tests_data = None
_current_tests_data = None
_indexer_loggers = {}


def get_tests_data(reset=False):
    """
    Initialize the test data and return it in a dict.
    """
global _tests_data, _current_tests_data
if _tests_data is None:
_tests_data = _init_tests_data()
indexers = _init_indexers(_tests_data)
for (name, idx) in indexers.items():
            # pytest makes the loggers use a temporary file, and deepcopy
            # requires serializability; so we remove them here and add them
            # back after the copy.
_indexer_loggers[name] = idx.log
del idx.log
_tests_data.update(indexers)
if reset or _current_tests_data is None:
_current_tests_data = deepcopy(_tests_data)
for (name, logger) in _indexer_loggers.items():
_current_tests_data[name].log = logger
return _current_tests_data


def override_storages(storage, idx_storage, search):
"""
Helper function to replace the storages from which archive data
are fetched.
"""
swh_config = config.get_config()
swh_config.update(
{"storage": storage, "indexer_storage": idx_storage, "search": search,}
)
service.storage = storage
service.idx_storage = idx_storage
service.search = search
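

# A minimal usage sketch (hypothetical test setup code, not part of this
# module): a test fixture would typically fetch the shared test data and
# point the web service at the in-memory storages:
#
#     data = get_tests_data(reset=True)
#     override_storages(data["storage"], data["idx_storage"], data["search"])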
