Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163475
D6404.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
D6404.id.diff
View Options
diff --git a/swh/loader/mercurial/objects.py b/swh/loader/mercurial/objects.py
deleted file mode 100644
--- a/swh/loader/mercurial/objects.py
+++ /dev/null
@@ -1,413 +0,0 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-"""This document contains various helper classes used in converting Mercurial
-bundle files into SWH Contents, Directories, etc.
-"""
-
-import binascii
-from collections import OrderedDict
-import copy
-import os
-import pickle
-import sqlite3
-import sys
-import zlib
-
-from sqlitedict import SqliteDict
-
-from swh.model import identifiers
-
-OS_PATH_SEP = os.path.sep.encode("utf-8")
-
-
-def _encode(obj):
- return sqlite3.Binary(zlib.compress(pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)))
-
-
-def _decode(obj):
- return pickle.loads(zlib.decompress(bytes(obj)))
-
-
-class SimpleBlob:
- """Stores basic metadata of a blob object.when constructing deep trees from
- commit file manifests.
-
- args:
- file_hash: unique hash of the file contents
- is_symlink: (bool) is this file a symlink?
- file_perms: (string) 3 digit permission code as a string or bytestring,
- e.g. '755' or b'755'
- """
-
- kind = "file"
-
- def __init__(self, file_hash, is_symlink, file_perms):
- self.hash = file_hash
- self.perms = 0o100000 + int(file_perms, 8)
- if is_symlink:
- self.perms += 0o020000
-
- def __str__(self):
- return "SimpleBlob: " + str(self.hash) + " -- " + str(self.perms)
-
- def __eq__(self, other):
- return (self.perms == other.perms) and (self.hash == other.hash)
-
- def size(self):
- """Return the size in byte."""
- return sys.getsizeof(self) + sys.getsizeof(self.__dict__)
-
-
-class SimpleTree(dict):
- """ Stores data for a nested directory object. Uses shallow cloning to stay
- compact after forking and change monitoring for efficient re-hashing.
- """
-
- kind = "dir"
- perms = 0o040000
-
- def __init__(self):
- self.hash = None
- self._size = None
-
- def __eq__(self, other):
- return (self.hash == other.hash) and (self.items() == other.items())
-
- def _new_tree_node(self, path):
- """Deeply nests SimpleTrees according to a given subdirectory path and
- returns a reference to the deepest one.
-
- args:
- path: bytestring containing a relative path from self to a deep
- subdirectory. e.g. b'foodir/bardir/bazdir'
-
- returns:
- the new node
- """
- node = self
- for d in path.split(OS_PATH_SEP):
- if node.get(d):
- if node[d].hash is not None:
- node[d] = copy.copy(node[d])
- node[d].hash = None
- node[d]._size = None
- else:
- node[d] = SimpleTree()
- node = node[d]
- return node
-
- def remove_tree_node_for_path(self, path):
- """Deletes a SimpleBlob or SimpleTree from inside nested SimpleTrees
- according to the given relative file path, and then recursively removes
- any newly depopulated SimpleTrees. It keeps the old history by doing a
- shallow clone before any change.
-
- args:
- path: bytestring containing a relative path from self to a nested
- file or directory. e.g. b'foodir/bardir/bazdir/quxfile.txt'
-
- returns:
- the new root node
- """
- node = self
- if node.hash is not None:
- node = copy.copy(node)
- node.hash = None
- node._size = None
- first, sep, rest = path.partition(OS_PATH_SEP)
- if rest:
- node[first] = node[first].remove_tree_node_for_path(rest)
- if len(node[first]) == 0:
- del node[first]
- else:
- del node[first]
-
- return node
-
- def add_blob(self, file_path, file_hash, is_symlink, file_perms):
- """Shallow clones the root node and then deeply nests a SimpleBlob
- inside nested SimpleTrees according to the given file path, shallow
- cloning all all intermediate nodes and marking them as changed and
- in need of new hashes.
-
- args:
- file_path: bytestring containing the relative path from self to a
- nested file
- file_hash: primary identifying hash computed from the blob contents
- is_symlink: True/False whether this item is a symbolic link
- file_perms: int or string representation of file permissions
-
- returns:
- the new root node
- """
- root = self
- if root.hash is not None:
- root = copy.copy(root)
- root.hash = None
- root._size = None
- node = root
- fdir, fbase = os.path.split(file_path)
- if fdir:
- node = root._new_tree_node(fdir)
- node[fbase] = SimpleBlob(file_hash, is_symlink, file_perms)
- return root
-
- def yield_swh_directories(self):
- """Converts nested SimpleTrees into a stream of SWH Directories.
-
- yields:
- an SWH Directory for every node in the tree
- """
- for k, v in sorted(self.items()):
- if isinstance(v, SimpleTree):
- yield from v.yield_swh_directories()
-
- yield {
- "id": self.hash,
- "entries": [
- {"name": k, "perms": v.perms, "type": v.kind, "target": v.hash}
- for k, v in sorted(self.items())
- ],
- }
-
- def hash_changed(self, new_dirs=None):
- """Computes and sets primary identifier hashes for unhashed subtrees.
-
- args:
- new_dirs (optional): an empty list to be populated with the SWH
- Directories for all of the new (not previously
- hashed) nodes
-
- returns:
- the top level hash of the whole tree
- """
- if self.hash is None:
- directory = {
- "entries": [
- {
- "name": k,
- "perms": v.perms,
- "type": v.kind,
- "target": (
- v.hash if v.hash is not None else v.hash_changed(new_dirs)
- ),
- }
- for k, v in sorted(self.items())
- ]
- }
-
- self.hash = binascii.unhexlify(identifiers.directory_identifier(directory))
- directory["id"] = self.hash
- if new_dirs is not None:
- new_dirs.append(directory)
-
- return self.hash
-
- def flatten(self, _curpath=None, _files=None):
- """Converts nested sub-SimpleTrees and SimpleBlobs into a list of
- file paths. Useful for counting the number of files in a manifest.
-
- returns:
- a flat list of all of the contained file paths
- """
- _curpath = _curpath or b""
- _files = _files or {}
- for k, v in sorted(self.items()):
- p = os.path.join(_curpath, k)
- if isinstance(v, SimpleBlob):
- _files[p] = (v.hash, v.perms)
- else:
- v.flatten(p, _files)
- return _files
-
- def size(self):
- """Return the (approximate?) memory utilization in bytes of the nested
- structure.
- """
- if self._size is None:
- self._size = (
- sys.getsizeof(self)
- + sys.getsizeof(self.__dict__)
- + sum([sys.getsizeof(k) + v.size() for k, v in self.items()])
- )
- return self._size
-
-
-class SelectiveCache(OrderedDict):
- """Special cache for storing past data upon which new data is known to be
- dependent. Optional hinting of how many instances of which keys will be
- needed down the line makes utilization more efficient. And, because the
- distance between related data can be arbitrarily long and the data
- fragments can be arbitrarily large, a disk-based secondary storage is used
- if the primary RAM-based storage area is filled to the designated capacity.
-
- Storage is occupied in three phases:
-
- 1) The most recent key/value pair is always held, regardless of other
- factors, until the next entry replaces it.
-
- 2) Stored key/value pairs are pushed into a randomly accessible
- expanding buffer in memory with a stored size function, maximum size
- value, and special hinting about which keys to store for how long
- optionally declared at instantiation.
-
- 3) The in-memory buffer pickles into a randomly accessible disk-backed
- secondary buffer when it becomes full.
-
- Occupied space is calculated by default as whatever the len() function
- returns on the values being stored. This can be changed by passing in a new
- size_function at instantiation.
-
- The cache_hints parameter is a dict of key/int pairs recording how many
- subsequent fetches that particular key's value should stay in storage for
- before being erased. If you provide a set of hints and then try to store a
- key that is not in that set of hints, the cache will store it only while it
- is the most recent entry, and will bypass storage phases 2 and 3.
- """
-
- DEFAULT_SIZE = 800 * 1024 * 1024 # bytes or whatever
-
- def __init__(
- self, max_size=None, cache_hints=None, size_function=None, filename=None
- ):
- """
- args:
- max_size: integer value indicating the maximum size of the part
- of storage held in memory
- cache_hints: dict of key/int pairs as described in the class
- description
- size_function: callback function that accepts one parameter and
- returns one int, which should probably be the calculated
- size of the parameter
- """
- self._max_size = max_size or SelectiveCache.DEFAULT_SIZE
- self._disk = None
- if size_function is None:
- self._size_function = sys.getsizeof
- else:
- self._size_function = size_function
- self._latest = None
- self._cache_size = 0
- self._cache_hints = copy.copy(cache_hints) or None
- self.filename = filename
-
- def store(self, key, data):
- """Primary method for putting data into the cache.
-
- args:
- key: any hashable value
- data: any python object (preferably one that is measurable)
-
- """
- self._latest = (key, data)
-
- if (self._cache_hints is not None) and (key not in self._cache_hints):
- return
-
- # cache the completed data...
- self._cache_size += self._size_function(data) + 53
-
- # ...but limit memory expenditure for the cache by offloading to disk
- should_commit = False
- while self._cache_size > self._max_size and len(self) > 0:
- should_commit = True
- k, v = self.popitem(last=False)
- self._cache_size -= self._size_function(v) - 53
- self._diskstore(k, v)
-
- if should_commit:
- self._disk.commit(blocking=False)
-
- self[key] = data
-
- def _diskstore(self, key, value):
- if self._disk is None:
- self._disk = SqliteDict(
- autocommit=False,
- journal_mode="OFF",
- filename=self.filename,
- tablename="swh",
- encode=_encode,
- decode=_decode,
- )
- self._disk.in_temp = True # necessary to force the disk clean up
- self._disk[key] = value
-
- def has(self, key):
- """Tests whether the data for the provided key is being stored.
-
- args:
- key: the key of the data whose storage membership property you wish
- to discover
-
- returns:
- True or False
- """
- return (
- (self._latest and self._latest[0] == key)
- or (key in self)
- or (self._disk and (key in self._disk))
- )
-
- def fetch(self, key):
- """Pulls a value out of storage and decrements the hint counter for the
- given key.
-
- args:
- key: the key of the data that you want to retrieve
-
- returns:
- the retrieved value or None
- """
- retval = None
- if self._latest and self._latest[0] == key:
- retval = self._latest[1]
- if retval is None:
- retval = self.get(key)
- if (retval is None) and self._disk:
- self._disk.commit(blocking=False)
- retval = self._disk.get(key) or None
- self.dereference(key)
- return retval
-
- def dereference(self, key):
- """Remove one instance of expected future retrieval of the data for the
- given key. This is called automatically by fetch requests that aren't
- satisfied by phase 1 of storage.
-
- args:
- the key of the data for which the future retrievals hint is to be
- decremented
- """
- newref = self._cache_hints and self._cache_hints.get(key)
- if newref:
- newref -= 1
- if newref == 0:
- del self._cache_hints[key]
- if key in self:
- item = self[key]
- self._cache_size -= self._size_function(item)
- del self[key]
- else:
- if self._disk:
- del self._disk[key]
- else:
- self._cache_hints[key] = newref
-
- def keys(self):
- yield from self.keys()
- if self._disk:
- yield from self._disk.keys()
-
- def values(self):
- yield from self.values()
- if self._disk:
- yield from self._disk.values()
-
- def items(self):
- yield from self.items()
- if self._disk:
- yield from self._disk.items()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 8:50 AM (20 h, 26 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3215962
Attached To
D6404: Clean up missed unused module from early clean up
Event Timeline
Log In to Comment