# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import random
import time
from copy import deepcopy
from datetime import timedelta
from typing import Dict, List, Optional, Set

from swh.indexer.fossology_license import FossologyLicenseIndexer
from swh.indexer.mimetype import MimetypeIndexer
from swh.indexer.ctags import CtagsIndexer
from swh.indexer.storage import get_indexer_storage
from swh.model.hashutil import hash_to_hex, hash_to_bytes, DEFAULT_ALGORITHMS
from swh.model.model import Content, Directory, Origin, OriginVisit, OriginVisitStatus
from swh.loader.git.from_disk import GitLoaderFromArchive
from swh.search import get_search
from swh.storage.algos.dir_iterators import dir_iterator
from swh.storage.algos.snapshot import snapshot_get_latest
from swh.storage.interface import Sha1
from swh.storage.utils import now
from swh.web import config
from swh.web.browse.utils import (
get_mimetype_and_encoding_for_content,
prepare_content_for_display,
_re_encode_content,
)
from swh.web.common import service

# Module used to initialize data that will be provided as test input

# Configuration for the git loader
_TEST_LOADER_CONFIG = {
"storage": {"cls": "memory",},
"save_data": False,
"max_content_size": 100 * 1024 * 1024,
}

# Base content indexer configuration
_TEST_INDEXER_BASE_CONFIG = {
"storage": {"cls": "memory"},
"objstorage": {"cls": "memory", "args": {},},
"indexer_storage": {"cls": "memory", "args": {},},
}


def random_sha1():
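    """Return 20 random bytes as a 40-character hexadecimal string."""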
return hash_to_hex(bytes(random.randint(0, 255) for _ in range(20)))


def random_sha256():
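    """Return 32 random bytes as a 64-character hexadecimal string."""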
return hash_to_hex(bytes(random.randint(0, 255) for _ in range(32)))


def random_blake2s256():
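    """Return 32 random bytes as a 64-character hexadecimal string."""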
return hash_to_hex(bytes(random.randint(0, 255) for _ in range(32)))


def random_content():
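    """Return a dict of random checksums shaped like a content identifier."""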
return {
"sha1": random_sha1(),
"sha1_git": random_sha1(),
"sha256": random_sha256(),
"blake2s256": random_blake2s256(),
}


# MimetypeIndexer with custom configuration for tests
class _MimetypeIndexer(MimetypeIndexer):
def parse_config_file(self, *args, **kwargs):
return {
**_TEST_INDEXER_BASE_CONFIG,
"tools": {
"name": "file",
"version": "1:5.30-1+deb9u1",
"configuration": {"type": "library", "debian-package": "python3-magic"},
},
}


# FossologyLicenseIndexer with custom configuration for tests
class _FossologyLicenseIndexer(FossologyLicenseIndexer):
def parse_config_file(self, *args, **kwargs):
return {
**_TEST_INDEXER_BASE_CONFIG,
"workdir": "/tmp/swh/indexer.fossology.license",
"tools": {
"name": "nomos",
"version": "3.1.0rc2-31-ga2cbb8c",
"configuration": {"command_line": "nomossa <filepath>",},
},
}


# CtagsIndexer with custom configuration for tests
class _CtagsIndexer(CtagsIndexer):
def parse_config_file(self, *args, **kwargs):
return {
**_TEST_INDEXER_BASE_CONFIG,
"workdir": "/tmp/swh/indexer.ctags",
"languages": {"c": "c"},
"tools": {
"name": "universal-ctags",
"version": "~git7859817b",
"configuration": {
"command_line": """ctags --fields=+lnz --sort=no --links=no """
"""--output-format=json <filepath>"""
},
},
}


# Lightweight git repositories that will be loaded to generate
# input data for tests
_TEST_ORIGINS = [
{
"type": "git",
"url": "https://github.com/wcoder/highlightjs-line-numbers.js",
"archives": [
"highlightjs-line-numbers.js.zip",
"highlightjs-line-numbers.js_visit2.zip",
],
},
{
"type": "git",
"url": "https://github.com/memononen/libtess2",
"archives": ["libtess2.zip"],
},
{
"type": "git",
"url": "repo_with_submodules",
"archives": ["repo_with_submodules.tgz"],
},
]


_contents = {}


def _add_extra_contents(storage, contents):
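    """Add extra contents to the test storage (currently a sample PBM
    image) and record their sha1 in the given ``contents`` set."""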
pbm_image_data = b"""P1
# PBM example
24 7
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0
0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0
0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0
0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"""
    # Add a file with mimetype image/x-portable-bitmap to the archive content
pbm_content = Content.from_data(pbm_image_data)
storage.content_add([pbm_content])
contents.add(pbm_content.sha1)


# Test data initialization
def _init_tests_data():
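    """Load the test git repositories into a shared in-memory storage and
    collect the identifiers of the loaded objects to use as test input."""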
    # Will hold a reference to the in-memory storage shared by the loaders
storage = None
# Create search instance
search = get_search("memory")
search.initialize()
search.origin_update({"url": origin["url"]} for origin in _TEST_ORIGINS)
# Load git repositories from archives
for origin in _TEST_ORIGINS:
for i, archive in enumerate(origin["archives"]):
if i > 0:
# ensure visit dates will be different when simulating
# multiple visits of an origin
time.sleep(1)
origin_repo_archive = os.path.join(
os.path.dirname(__file__), "resources/repos/%s" % archive
)
loader = GitLoaderFromArchive(
origin["url"],
archive_path=origin_repo_archive,
config=_TEST_LOADER_CONFIG,
)
if storage is None:
storage = loader.storage
else:
loader.storage = storage
loader.load()
ori = storage.origin_get([origin["url"]])[0]
origin.update(ori.to_dict()) # add an 'id' key if enabled
search.origin_update([{"url": origin["url"], "has_visits": True}])
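
    # Register 250 extra origins, each with a single full visit pointing
    # to a fixed snapshot id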
for i in range(250):
url = "https://many.origins/%d" % (i + 1)
storage.origin_add([Origin(url=url)])
search.origin_update([{"url": url, "has_visits": True}])
date = now()
visit = OriginVisit(origin=url, date=date, type="tar")
visit = storage.origin_visit_add([visit])[0]
visit_status = OriginVisitStatus(
origin=url,
visit=visit.visit,
date=date + timedelta(minutes=1),
status="full",
snapshot=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
)
storage.origin_visit_status_add([visit_status])
sha1s: Set[Sha1] = set()
directories = set()
revisions = set()
releases = set()
snapshots = set()
content_path = {}
# Get all objects loaded into the test archive
for origin in _TEST_ORIGINS:
snp = snapshot_get_latest(storage, origin["url"])
snapshots.add(hash_to_hex(snp.id))
for branch_name, branch_data in snp.branches.items():
target_type = branch_data.target_type.value
if target_type == "revision":
revisions.add(branch_data.target)
elif target_type == "release":
release = storage.release_get([branch_data.target])[0]
revisions.add(release.target)
releases.add(hash_to_hex(branch_data.target))
for rev_log in storage.revision_shortlog(set(revisions)):
rev_id = rev_log[0]
revisions.add(rev_id)
for rev in storage.revision_get(revisions):
dir_id = rev["directory"]
directories.add(hash_to_hex(dir_id))
for entry in dir_iterator(storage, dir_id):
if entry["type"] == "file":
sha1s.add(entry["sha1"])
content_path[entry["sha1"]] = "/".join(
[hash_to_hex(dir_id), entry["path"].decode("utf-8")]
)
elif entry["type"] == "dir":
directories.add(hash_to_hex(entry["target"]))
_add_extra_contents(storage, sha1s)
# Get all checksums for each content
result: List[Optional[Content]] = storage.content_get(list(sha1s))
contents: List[Dict] = []
for content in result:
assert content is not None
sha1 = hash_to_hex(content.sha1)
content_metadata = {
algo: hash_to_hex(getattr(content, algo)) for algo in DEFAULT_ALGORITHMS
}
path = ""
if content.sha1 in content_path:
path = content_path[content.sha1]
cnt_data = storage.content_get_data(content.sha1)
assert cnt_data is not None
mimetype, encoding = get_mimetype_and_encoding_for_content(cnt_data)
_, _, cnt_data = _re_encode_content(mimetype, encoding, cnt_data)
content_display_data = prepare_content_for_display(cnt_data, mimetype, path)
content_metadata.update(
{
"path": path,
"mimetype": mimetype,
"encoding": encoding,
"hljs_language": content_display_data["language"],
"data": content_display_data["content_data"],
}
)
_contents[sha1] = content_metadata
contents.append(content_metadata)
# Create indexer storage instance that will be shared by indexers
idx_storage = get_indexer_storage("memory", {})
# Add the empty directory to the test archive
storage.directory_add([Directory(entries=())])
    # Return test data
return {
"search": search,
"storage": storage,
"idx_storage": idx_storage,
"origins": _TEST_ORIGINS,
"contents": contents,
"directories": list(directories),
"releases": list(releases),
"revisions": list(map(hash_to_hex, revisions)),
"snapshots": list(snapshots),
"generated_checksums": set(),
}


def _init_indexers(tests_data):
# Instantiate content indexers that will be used in tests
# and force them to use the memory storages
indexers = {}
for idx_name, idx_class in (
("mimetype_indexer", _MimetypeIndexer),
("license_indexer", _FossologyLicenseIndexer),
("ctags_indexer", _CtagsIndexer),
):
idx = idx_class()
idx.storage = tests_data["storage"]
idx.objstorage = tests_data["storage"].objstorage
idx.idx_storage = tests_data["idx_storage"]
idx.register_tools(idx.config["tools"])
indexers[idx_name] = idx
return indexers


def get_content(content_sha1):
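    """Return the metadata computed by _init_tests_data for the content
    whose hexadecimal sha1 is ``content_sha1``, or None if it is unknown."""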
return _contents.get(content_sha1)


_tests_data = None
_current_tests_data = None
_indexer_loggers = {}


def get_tests_data(reset=False):
    """
    Initialize the test data and return it in a dict.
    """
global _tests_data, _current_tests_data
if _tests_data is None:
_tests_data = _init_tests_data()
indexers = _init_indexers(_tests_data)
for (name, idx) in indexers.items():
            # pytest makes the loggers use a temporary file, and deepcopy
            # requires serializability; so we remove them here and add them
            # back after the copy.
_indexer_loggers[name] = idx.log
del idx.log
_tests_data.update(indexers)
if reset or _current_tests_data is None:
_current_tests_data = deepcopy(_tests_data)
for (name, logger) in _indexer_loggers.items():
_current_tests_data[name].log = logger
return _current_tests_data


def override_storages(storage, idx_storage, search):
"""
Helper function to replace the storages from which archive data
are fetched.
"""
swh_config = config.get_config()
swh_config.update(
{"storage": storage, "indexer_storage": idx_storage, "search": search,}
)
service.storage = storage
service.idx_storage = idx_storage
service.search = search
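

# A minimal usage sketch (hypothetical test setup code, not part of this
# module): a test fixture would typically fetch the shared test data and
# point the web service at the in-memory storages:
#
#     data = get_tests_data(reset=True)
#     override_storages(data["storage"], data["idx_storage"], data["search"])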
