diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
index e1afc6b..583df11 100644
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -1,387 +1,407 @@
 # Copyright (C) 2017-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import enum
 import os
 import stat

-from typing import List
+import attr
+from typing import List, Optional

-from .hashutil import MultiHash, HASH_BLOCK_SIZE
+from .hashutil import MultiHash
 from .merkle import MerkleLeaf, MerkleNode
 from .identifiers import (
     directory_entry_sort_key, directory_identifier,
     identifier_to_bytes as id_to_bytes,
     identifier_to_str as id_to_str,
 )
+from . import model
+
+
+@attr.s
+class DiskBackedContent(model.Content):
+    """Subclass of Content, which allows lazy-loading data from the disk."""
+    path = attr.ib(type=Optional[bytes], default=None)
+
+    def __attrs_post_init__(self):
+        if self.path is None:
+            raise TypeError('path must not be None.')
+
+    def with_data(self) -> model.Content:
+        args = self.to_dict()
+        del args['path']
+        assert self.path is not None
+        with open(self.path, 'rb') as fd:
+            return model.Content.from_dict({
+                **args,
+                'data': fd.read()})


 class DentryPerms(enum.IntEnum):
     """Admissible permissions for directory entries."""
     content = 0o100644
     """Content"""
     executable_content = 0o100755
     """Executable content (e.g. executable script)"""
     symlink = 0o120000
     """Symbolic link"""
     directory = 0o040000
     """Directory"""
     revision = 0o160000
     """Revision (e.g. submodule)"""


 def mode_to_perms(mode):
     """Convert a file mode to a permission compatible with Software Heritage
     directory entries

     Args:
       mode (int): a file mode as returned by :func:`os.stat` in
                   :attr:`os.stat_result.st_mode`

     Returns:
       DentryPerms: one of the following values:
         :const:`DentryPerms.content`: plain file
         :const:`DentryPerms.executable_content`: executable file
         :const:`DentryPerms.symlink`: symbolic link
         :const:`DentryPerms.directory`: directory
     """
     if stat.S_ISLNK(mode):
         return DentryPerms.symlink
     if stat.S_ISDIR(mode):
         return DentryPerms.directory
     else:
         # file is executable in any way
         if mode & (0o111):
             return DentryPerms.executable_content
         else:
             return DentryPerms.content


 class Content(MerkleLeaf):
     """Representation of a Software Heritage content as a node in a Merkle
     tree.

     The current Merkle hash for the Content nodes is the `sha1_git`, which
     makes it consistent with what :class:`Directory` uses for its own hash
     computation.
     """
     __slots__ = []  # type: List[str]
     type = 'content'

     @classmethod
     def from_bytes(cls, *, mode, data):
         """Convert data (raw :class:`bytes`) to a Software Heritage content
         entry

         Args:
           mode (int): a file mode (passed to :func:`mode_to_perms`)
           data (bytes): raw contents of the file
         """
         ret = MultiHash.from_data(data).digest()
         ret['length'] = len(data)
         ret['perms'] = mode_to_perms(mode)
         ret['data'] = data
         ret['status'] = 'visible'

         return cls(ret)

     @classmethod
     def from_symlink(cls, *, path, mode):
         """Convert a symbolic link to a Software Heritage content entry"""
         return cls.from_bytes(mode=mode, data=os.readlink(path))

     @classmethod
     def from_file(
-            cls, *, path, data=False, save_path=False,
-            max_content_length=None):
+            cls, *, path, max_content_length=None):
         """Compute the Software Heritage content entry corresponding to an
         on-disk file.

         The returned dictionary contains keys useful for both:
         - loading the content in the archive (hashes, `length`)
         - using the content as a directory entry in a directory

         Args:
           path (bytes): path to the file for which we're computing the
             content entry
-          data (bool): add the file data to the entry
-          save_path (bool): add the file path to the entry
           max_content_length (Optional[int]): if given, all contents larger
             than this will be skipped.
         """
         file_stat = os.lstat(path)
         mode = file_stat.st_mode
         length = file_stat.st_size
         too_large = max_content_length is not None \
             and length > max_content_length

         if stat.S_ISLNK(mode):
             # Symbolic link: return a file whose contents are the link target

             if too_large:
                 # Unlike large contents, we can't stream symlinks to
                 # MultiHash, and we don't want to fit them in memory if
                 # they exceed max_content_length either.
                 # Thankfully, this should not happen for reasonable values of
                 # max_content_length because of OS/filesystem limitations,
                 # so let's just raise an error.
                 raise Exception(f'Symlink too large ({length} bytes)')

             return cls.from_symlink(path=path, mode=mode)
         elif not stat.S_ISREG(mode):
             # not a regular file: return the empty file instead
             return cls.from_bytes(mode=mode, data=b'')

         if too_large:
             skip_reason = 'Content too large'
-        elif not data:
-            skip_reason = 'Skipping file content'
         else:
             skip_reason = None

+        hashes = MultiHash.from_path(path).digest()
         if skip_reason:
             ret = {
-                **MultiHash.from_path(path).digest(),
+                **hashes,
                 'status': 'absent',
                 'reason': skip_reason,
             }
         else:
-            h = MultiHash(length=length)
-            chunks = []
-            with open(path, 'rb') as fobj:
-                while True:
-                    chunk = fobj.read(HASH_BLOCK_SIZE)
-                    if not chunk:
-                        break
-                    h.update(chunk)
-                    chunks.append(chunk)
-
             ret = {
-                **h.digest(),
+                **hashes,
                 'status': 'visible',
-                'data': b''.join(chunks),
             }

-        if save_path:
-            ret['path'] = path
+        ret['path'] = path
         ret['perms'] = mode_to_perms(mode)
         ret['length'] = length

         obj = cls(ret)
         return obj

     def __repr__(self):
         return 'Content(id=%s)' % id_to_str(self.hash)

     def compute_hash(self):
         return self.data['sha1_git']

+    def to_model(self) -> model.BaseContent:
+        """Builds a `model.BaseContent` object based on this leaf."""
+        data = self.get_data().copy()
+        data.pop('perms', None)
+        if data['status'] == 'absent':
+            data.pop('path', None)
+            return model.SkippedContent.from_dict(data)
+        elif 'data' in data:
+            return model.Content.from_dict(data)
+        else:
+            return DiskBackedContent.from_dict(data)
+

 def accept_all_directories(dirname, entries):
     """Default filter for :func:`Directory.from_disk` accepting all
     directories

     Args:
       dirname (bytes): directory name
       entries (list): directory entries
     """
     return True


 def ignore_empty_directories(dirname, entries):
     """Filter for :func:`directory_to_objects` ignoring empty directories

     Args:
       dirname (bytes): directory name
       entries (list): directory entries

     Returns:
       True if the directory is not empty, false if the directory is empty
     """
     return bool(entries)


 def ignore_named_directories(names, *, case_sensitive=True):
     """Filter for :func:`directory_to_objects` to ignore directories named one
     of names.

     Args:
       names (list of bytes): names to ignore
       case_sensitive (bool): whether to do the filtering in a case sensitive
         way

     Returns:
       a directory filter for :func:`directory_to_objects`
     """
     if not case_sensitive:
         names = [name.lower() for name in names]

     def named_filter(dirname, entries,
                      names=names, case_sensitive=case_sensitive):
         if case_sensitive:
             return dirname not in names
         else:
             return dirname.lower() not in names

     return named_filter


 class Directory(MerkleNode):
     """Representation of a Software Heritage directory as a node in a Merkle
     Tree.

     This class can be used to generate, from an on-disk directory, all the
     objects that need to be sent to the Software Heritage archive.

     The :func:`from_disk` constructor allows you to generate the data
     structure from a directory on disk. The resulting :class:`Directory`
     can then be manipulated as a dictionary, using the path as key.

     The :func:`collect` method is used to retrieve all the objects that need
     to be added to the Software Heritage archive since the last collection,
     by class (contents and directories).

     When using the dict-like methods to update the contents of the directory,
     the affected levels of hierarchy are reset and can be collected again
     using the same method. This enables the efficient collection of updated
     nodes, for instance when the client is applying diffs.
     """
     __slots__ = ['__entries']
     type = 'directory'

     @classmethod
-    def from_disk(cls, *, path, data=False, save_path=False,
+    def from_disk(cls, *, path,
                   dir_filter=accept_all_directories,
                   max_content_length=None):
         """Compute the Software Heritage objects for a given directory tree

         Args:
           path (bytes): the directory to traverse
-          data (bool): whether to add the data to the content objects
-          save_path (bool): whether to add the path to the content objects
           dir_filter (function): a filter to ignore some directories by
             name or contents. Takes two arguments: dirname and entries, and
             returns True if the directory should be added, False if the
             directory should be ignored.
           max_content_length (Optional[int]): if given, all contents larger
             than this will be skipped.
         """
         top_path = path
         dirs = {}

         for root, dentries, fentries in os.walk(top_path, topdown=False):
             entries = {}
             # Join fentries and dentries in the same processing, as symbolic
             # links to directories appear in dentries...
             for name in fentries + dentries:
                 path = os.path.join(root, name)
                 if not os.path.isdir(path) or os.path.islink(path):
                     content = Content.from_file(
-                        path=path, data=data, save_path=save_path,
-                        max_content_length=max_content_length)
+                        path=path, max_content_length=max_content_length)
                     entries[name] = content
                 else:
                     if dir_filter(name, dirs[path].entries):
                         entries[name] = dirs[path]

             dirs[root] = cls({'name': os.path.basename(root)})
             dirs[root].update(entries)

         return dirs[top_path]

     def __init__(self, data=None):
         super().__init__(data=data)
         self.__entries = None

     def invalidate_hash(self):
         self.__entries = None
         super().invalidate_hash()

     @staticmethod
     def child_to_directory_entry(name, child):
         if isinstance(child, Directory):
             return {
                 'type': 'dir',
                 'perms': DentryPerms.directory,
                 'target': child.hash,
                 'name': name,
             }
         elif isinstance(child, Content):
             return {
                 'type': 'file',
                 'perms': child.data['perms'],
                 'target': child.hash,
                 'name': name,
             }
         else:
             raise ValueError('unknown child')

     def get_data(self, **kwargs):
         return {
             'id': self.hash,
             'entries': self.entries,
         }

     @property
     def entries(self):
         """Child nodes, sorted by name in the same way
         `directory_identifier` does."""
         if self.__entries is None:
             self.__entries = sorted((
                 self.child_to_directory_entry(name, child)
                 for name, child in self.items()
             ), key=directory_entry_sort_key)

         return self.__entries

     def compute_hash(self):
         return id_to_bytes(directory_identifier({'entries': self.entries}))

+    def to_model(self) -> model.Directory:
+        """Builds a `model.Directory` object based on this node;
+        ignoring its children."""
+        return model.Directory.from_dict(self.get_data())
+
     def __getitem__(self, key):
         if not isinstance(key, bytes):
             raise ValueError('Can only get a bytes from Directory')

         # Convenience shortcut
         if key == b'':
             return self

         if b'/' not in key:
             return super().__getitem__(key)
         else:
             key1, key2 = key.split(b'/', 1)
             return self.__getitem__(key1)[key2]

     def __setitem__(self, key, value):
         if not isinstance(key, bytes):
             raise ValueError('Can only set a bytes Directory entry')
         if not isinstance(value, (Content, Directory)):
             raise ValueError('Can only set a Directory entry to a Content or '
                              'Directory')

         if key == b'':
             raise ValueError('Directory entry must have a name')
         if b'\x00' in key:
             raise ValueError('Directory entry name must not contain nul bytes')

         if b'/' not in key:
             return super().__setitem__(key, value)
         else:
             key1, key2 = key.rsplit(b'/', 1)
             self[key1].__setitem__(key2, value)

     def __delitem__(self, key):
         if not isinstance(key, bytes):
             raise ValueError('Can only delete a bytes Directory entry')

         if b'/' not in key:
             super().__delitem__(key)
         else:
             key1, key2 = key.rsplit(b'/', 1)
             del self[key1][key2]

     def __repr__(self):
         return 'Directory(id=%s, entries=[%s])' % (
             id_to_str(self.hash),
             ', '.join(str(entry) for entry in self),
         )
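For context, a minimal sketch of how the reworked from_disk API fits together after this change. The scratch tree below is hypothetical; Directory.from_disk(), to_model(), DiskBackedContent and with_data() are the entry points defined above.

import os
import tempfile

from swh.model.from_disk import Directory

# Hypothetical scratch tree, for illustration only.
workdir = tempfile.mkdtemp(prefix='swh.from_disk.demo.')
with open(os.path.join(workdir, 'hello.txt'), 'wb') as f:
    f.write(b'hello\n')

tree = Directory.from_disk(path=os.fsencode(workdir))

# File data is no longer held in memory: to_model() yields a
# DiskBackedContent, and with_data() re-reads the file on demand.
content = tree[b'hello.txt'].to_model()
assert content.with_data().data == b'hello\n'

# The directory node itself converts to a model.Directory; children are
# referenced by its entries rather than embedded.
directory = tree.to_model()
assert directory.id == tree.hash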
diff --git a/swh/model/model.py b/swh/model/model.py
index 26accb6..d3c9c7d 100644
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -1,474 +1,495 @@
 # Copyright (C) 2018-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime

 from abc import ABCMeta, abstractmethod
 from enum import Enum
 from typing import List, Optional, Dict

 import attr
 import dateutil.parser

 from .identifiers import (
     normalize_timestamp, directory_identifier, revision_identifier,
     release_identifier, snapshot_identifier
 )
 from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes

+
+class MissingData(Exception):
+    """Raised by `Content.with_data` when it has no way of fetching the
+    data (but not when fetching the data fails)."""
+    pass
+
+
 SHA1_SIZE = 20

 # TODO: Limit this to 20 bytes
 Sha1Git = bytes


 class BaseModel:
     """Base class for SWH model classes.

     Provides serialization/deserialization to/from Python dictionaries,
     that are suitable for JSON/msgpack-like formats."""

     def to_dict(self):
         """Wrapper of `attr.asdict` that can be overridden by subclasses
         that have special handling of some of the fields."""
         def dictify(value):
             if isinstance(value, BaseModel):
                 return value.to_dict()
             elif isinstance(value, Enum):
                 return value.value
             elif isinstance(value, dict):
                 return {k: dictify(v) for k, v in value.items()}
             elif isinstance(value, list):
                 return [dictify(v) for v in value]
             else:
                 return value

         ret = attr.asdict(self, recurse=False)
         return dictify(ret)

     @classmethod
     def from_dict(cls, d):
         """Takes a dictionary representing a tree of SWH objects, and
         recursively builds the corresponding objects."""
         return cls(**d)


 class HashableObject(metaclass=ABCMeta):
     """Mixin to automatically compute object identifier hash when
     the associated model is instantiated."""

     @staticmethod
     @abstractmethod
     def compute_hash(object_dict):
         """Derived model classes must implement this to compute
         the object hash from its dict representation."""
         pass

     def __attrs_post_init__(self):
         if not self.id:
             obj_id = hash_to_bytes(self.compute_hash(self.to_dict()))
             object.__setattr__(self, 'id', obj_id)


 @attr.s(frozen=True)
 class Person(BaseModel):
     """Represents the author/committer of a revision or release."""
     name = attr.ib(type=bytes)
     email = attr.ib(type=bytes)
     fullname = attr.ib(type=bytes)


 @attr.s(frozen=True)
 class Timestamp(BaseModel):
     """Represents a naive timestamp from a VCS."""
     seconds = attr.ib(type=int)
     microseconds = attr.ib(type=int)

     @seconds.validator
     def check_seconds(self, attribute, value):
         """Check that seconds fit in a 64-bits signed integer."""
         if not (-2**63 <= value < 2**63):
             raise ValueError('Seconds must be a signed 64-bits integer.')

     @microseconds.validator
     def check_microseconds(self, attribute, value):
         """Checks that microseconds are positive and < 1000000."""
         if not (0 <= value < 10**6):
             raise ValueError('Microseconds must be in [0, 1000000[.')


 @attr.s(frozen=True)
 class TimestampWithTimezone(BaseModel):
     """Represents a TZ-aware timestamp from a VCS."""
     timestamp = attr.ib(type=Timestamp)
     offset = attr.ib(type=int)
     negative_utc = attr.ib(type=bool)

     @offset.validator
     def check_offset(self, attribute, value):
         """Checks the offset is a 16-bits signed integer (in theory, it
         should always be between -14 and +14 hours)."""
         if not (-2**15 <= value < 2**15):
             # max 14 hours offset in theory, but you never know what
             # you'll find in the wild...
             raise ValueError('offset too large: %d minutes' % value)

     @classmethod
     def from_dict(cls, d):
         """Builds a TimestampWithTimezone from any of the formats
         accepted by :func:`swh.model.normalize_timestamp`."""
         d = normalize_timestamp(d)
         return cls(
             timestamp=Timestamp.from_dict(d['timestamp']),
             offset=d['offset'],
             negative_utc=d['negative_utc'])


 @attr.s(frozen=True)
 class Origin(BaseModel):
     """Represents a software source: a VCS and an URL."""
     url = attr.ib(type=str)
     type = attr.ib(type=Optional[str], default=None)

     def to_dict(self):
         r = super().to_dict()
         r.pop('type', None)
         return r


 @attr.s(frozen=True)
 class OriginVisit(BaseModel):
     """Represents a visit of an origin at a given point in time, by a
     SWH loader."""
     origin = attr.ib(type=str)
     date = attr.ib(type=datetime.datetime)
     status = attr.ib(
         type=str,
         validator=attr.validators.in_(['ongoing', 'full', 'partial']))
     type = attr.ib(type=str)
     snapshot = attr.ib(type=Optional[Sha1Git])
     metadata = attr.ib(type=Optional[Dict[str, object]], default=None)
     visit = attr.ib(type=Optional[int], default=None)
     """Should not be set before calling 'origin_visit_add()'."""

     def to_dict(self):
         """Serializes the date as a string and omits the visit id if it is
         `None`."""
         ov = super().to_dict()
         if ov['visit'] is None:
             del ov['visit']
         return ov

     @classmethod
     def from_dict(cls, d):
         """Parses the date from a string, and accepts missing visit ids."""
         d = d.copy()
         date = d.pop('date')
         return cls(
             date=(date
                   if isinstance(date, datetime.datetime)
                   else dateutil.parser.parse(date)),
             **d)


 class TargetType(Enum):
     """The type of content pointed to by a snapshot branch. Usually a
     revision or an alias."""
     CONTENT = 'content'
     DIRECTORY = 'directory'
     REVISION = 'revision'
     RELEASE = 'release'
     SNAPSHOT = 'snapshot'
     ALIAS = 'alias'


 class ObjectType(Enum):
     """The type of content pointed to by a release. Usually a revision"""
     CONTENT = 'content'
     DIRECTORY = 'directory'
     REVISION = 'revision'
     RELEASE = 'release'
     SNAPSHOT = 'snapshot'


 @attr.s(frozen=True)
 class SnapshotBranch(BaseModel):
     """Represents one of the branches of a snapshot."""
     target = attr.ib(type=bytes)
     target_type = attr.ib(type=TargetType)

     @target.validator
     def check_target(self, attribute, value):
         """Checks the target type is not an alias, checks the target is a
         valid sha1_git."""
         if self.target_type != TargetType.ALIAS and self.target is not None:
             if len(value) != 20:
                 raise ValueError('Wrong length for bytes identifier: %d' %
                                  len(value))

     @classmethod
     def from_dict(cls, d):
         return cls(
             target=d['target'],
             target_type=TargetType(d['target_type']))


 @attr.s(frozen=True)
 class Snapshot(BaseModel, HashableObject):
     """Represents the full state of an origin at a given point in time."""
     branches = attr.ib(type=Dict[bytes, Optional[SnapshotBranch]])
     id = attr.ib(type=Sha1Git, default=b'')

     @staticmethod
     def compute_hash(object_dict):
         return snapshot_identifier(object_dict)

     @classmethod
     def from_dict(cls, d):
         d = d.copy()
         return cls(
             branches={
                 name: SnapshotBranch.from_dict(branch) if branch else None
                 for (name, branch) in d.pop('branches').items()
             },
             **d)


 @attr.s(frozen=True)
 class Release(BaseModel, HashableObject):
     name = attr.ib(type=bytes)
     message = attr.ib(type=bytes)
     target = attr.ib(type=Optional[Sha1Git])
     target_type = attr.ib(type=ObjectType)
     synthetic = attr.ib(type=bool)
     author = attr.ib(type=Optional[Person], default=None)
     date = attr.ib(type=Optional[TimestampWithTimezone], default=None)
     metadata = attr.ib(type=Optional[Dict[str, object]], default=None)
     id = attr.ib(type=Sha1Git, default=b'')

     @staticmethod
     def compute_hash(object_dict):
         return release_identifier(object_dict)

     @author.validator
     def check_author(self, attribute, value):
         """If the author is `None`, checks the date is `None` too."""
         if self.author is None and self.date is not None:
             raise ValueError('release date must be None if author is None.')

     def to_dict(self):
         rel = super().to_dict()
         if rel['metadata'] is None:
             del rel['metadata']
         return rel

     @classmethod
     def from_dict(cls, d):
         d = d.copy()
         if d.get('author'):
             d['author'] = Person.from_dict(d['author'])
         if d.get('date'):
             d['date'] = TimestampWithTimezone.from_dict(d['date'])
         return cls(
             target_type=ObjectType(d.pop('target_type')),
             **d)


 class RevisionType(Enum):
     GIT = 'git'
     TAR = 'tar'
     DSC = 'dsc'
     SUBVERSION = 'svn'
     MERCURIAL = 'hg'


 @attr.s(frozen=True)
 class Revision(BaseModel, HashableObject):
     message = attr.ib(type=bytes)
     author = attr.ib(type=Person)
     committer = attr.ib(type=Person)
     date = attr.ib(type=Optional[TimestampWithTimezone])
     committer_date = attr.ib(type=Optional[TimestampWithTimezone])
     type = attr.ib(type=RevisionType)
     directory = attr.ib(type=Sha1Git)
     synthetic = attr.ib(type=bool)
     metadata = attr.ib(type=Optional[Dict[str, object]], default=None)
     parents = attr.ib(type=List[Sha1Git], default=attr.Factory(list))
     id = attr.ib(type=Sha1Git, default=b'')

     @staticmethod
     def compute_hash(object_dict):
         return revision_identifier(object_dict)

     @classmethod
     def from_dict(cls, d):
         d = d.copy()
         date = d.pop('date')
         if date:
             date = TimestampWithTimezone.from_dict(date)

         committer_date = d.pop('committer_date')
         if committer_date:
             committer_date = TimestampWithTimezone.from_dict(
                 committer_date)

         return cls(
             author=Person.from_dict(d.pop('author')),
             committer=Person.from_dict(d.pop('committer')),
             date=date,
             committer_date=committer_date,
             type=RevisionType(d.pop('type')),
             **d)


 @attr.s(frozen=True)
 class DirectoryEntry(BaseModel):
     name = attr.ib(type=bytes)
     type = attr.ib(type=str,
                    validator=attr.validators.in_(['file', 'dir', 'rev']))
     target = attr.ib(type=Sha1Git)
     perms = attr.ib(type=int)
     """Usually one of the values of `swh.model.from_disk.DentryPerms`."""


 @attr.s(frozen=True)
 class Directory(BaseModel, HashableObject):
     entries = attr.ib(type=List[DirectoryEntry])
     id = attr.ib(type=Sha1Git, default=b'')

     @staticmethod
     def compute_hash(object_dict):
         return directory_identifier(object_dict)

     @classmethod
     def from_dict(cls, d):
         d = d.copy()
         return cls(
             entries=[DirectoryEntry.from_dict(entry)
                      for entry in d.pop('entries')],
             **d)


 @attr.s(frozen=True)
 class BaseContent(BaseModel):
+    status = attr.ib(
+        type=str,
+        validator=attr.validators.in_(['visible', 'hidden', 'absent']))
+
     def to_dict(self):
         content = super().to_dict()
         if content['ctime'] is None:
             del content['ctime']
         return content

     @classmethod
     def from_dict(cls, d, use_subclass=True):
         if use_subclass:
             # Chooses a subclass to instantiate instead.
             if d['status'] == 'absent':
                 return SkippedContent.from_dict(d)
             else:
                 return Content.from_dict(d)
         else:
             return super().from_dict(d)

     def get_hash(self, hash_name):
         if hash_name not in DEFAULT_ALGORITHMS:
             raise ValueError('{} is not a valid hash name.'.format(hash_name))
         return getattr(self, hash_name)

     def hashes(self) -> Dict[str, bytes]:
         """Returns a dictionary {hash_name: hash_value}"""
         return {algo: getattr(self, algo) for algo in DEFAULT_ALGORITHMS}


 @attr.s(frozen=True)
 class Content(BaseContent):
     sha1 = attr.ib(type=bytes)
     sha1_git = attr.ib(type=Sha1Git)
     sha256 = attr.ib(type=bytes)
     blake2s256 = attr.ib(type=bytes)

     length = attr.ib(type=int)

     status = attr.ib(
         type=str,
         default='visible',
         validator=attr.validators.in_(['visible', 'hidden']))

-    data = attr.ib(type=Optional[bytes],
-                   default=None)
+
+    data = attr.ib(type=Optional[bytes], default=None)

     ctime = attr.ib(type=Optional[datetime.datetime], default=None)

     @length.validator
     def check_length(self, attribute, value):
         """Checks the length is positive."""
         if value < 0:
             raise ValueError('Length must be positive.')

     def to_dict(self):
         content = super().to_dict()
         if content['data'] is None:
             del content['data']
         return content

     @classmethod
     def from_dict(cls, d):
         return super().from_dict(d, use_subclass=False)

+    def with_data(self) -> 'Content':
+        """Loads the `data` attribute; meaning that it is guaranteed not to
+        be None after this call.
+
+        This call is almost a no-op, but subclasses may overload this method
+        to lazy-load data (eg. from disk or objstorage)."""
+        if self.data is None:
+            raise MissingData('Content data is None.')
+        return self
+

 @attr.s(frozen=True)
 class SkippedContent(BaseContent):
     sha1 = attr.ib(type=Optional[bytes])
     sha1_git = attr.ib(type=Optional[Sha1Git])
     sha256 = attr.ib(type=Optional[bytes])
     blake2s256 = attr.ib(type=Optional[bytes])

-    length = attr.ib(type=int)
+    length = attr.ib(type=Optional[int])

     status = attr.ib(
         type=str,
         validator=attr.validators.in_(['absent']))
     reason = attr.ib(type=Optional[str], default=None)

     origin = attr.ib(type=Optional[Origin], default=None)

     ctime = attr.ib(type=Optional[datetime.datetime], default=None)

     @reason.validator
     def check_reason(self, attribute, value):
         """Checks the reason is set when the content is skipped."""
         assert self.reason == value
         if value is None:
             raise ValueError('Must provide a reason if content is absent.')

     @length.validator
     def check_length(self, attribute, value):
         """Checks the length is positive or -1."""
         if value < -1:
             raise ValueError('Length must be positive or -1.')

     def to_dict(self):
         content = super().to_dict()
         if content['origin'] is None:
             del content['origin']
         return content

     @classmethod
     def from_dict(cls, d):
         d2 = d
         d = d.copy()
         if d.pop('data', None) is not None:
             raise ValueError('SkippedContent has no "data" attribute %r' % d2)
         return super().from_dict(d, use_subclass=False)
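The new Content/SkippedContent contract in a nutshell; a minimal sketch reusing the placeholder hashes from the tests below.

from swh.model.model import Content, MissingData, SkippedContent

hashes = dict(sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')

# with_data() is a no-op when the data is already inline...
c = Content(length=3, status='visible', data=b'42\n', **hashes)
assert c.with_data() is c

# ...and raises MissingData (rather than returning None) otherwise.
try:
    Content(length=3, status='visible', **hashes).with_data()
except MissingData:
    pass

# SkippedContent requires a reason, tolerates an unknown length (-1),
# and its from_dict() rejects any inline 'data' key.
SkippedContent(length=-1, status='absent', reason='Content too large',
               **hashes)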
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
index 137ac14..d9881a1 100644
--- a/swh/model/tests/test_from_disk.py
+++ b/swh/model/tests/test_from_disk.py
@@ -1,853 +1,963 @@
 # Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import pytest
 import tarfile
 import tempfile
 import unittest

 from typing import ClassVar, Optional

 from swh.model import from_disk
-from swh.model.from_disk import Content, DentryPerms, Directory
+from swh.model.from_disk import (
+    Content, DentryPerms, Directory, DiskBackedContent
+)
 from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
+from swh.model import model

 TEST_DATA = os.path.join(os.path.dirname(__file__), 'data')


 class ModeToPerms(unittest.TestCase):
     def setUp(self):
         super().setUp()

         # Generate a full permissions map
         self.perms_map = {}

         # Symlinks
         for i in range(0o120000, 0o127777 + 1):
             self.perms_map[i] = DentryPerms.symlink

         # Directories
         for i in range(0o040000, 0o047777 + 1):
             self.perms_map[i] = DentryPerms.directory

         # Other file types: socket, regular file, block device, character
         # device, fifo all map to regular files
         for ft in [0o140000, 0o100000, 0o060000, 0o020000, 0o010000]:
             for i in range(ft, ft + 0o7777 + 1):
                 if i & 0o111:
                     # executable bits are set
                     self.perms_map[i] = DentryPerms.executable_content
                 else:
                     self.perms_map[i] = DentryPerms.content

     def test_exhaustive_mode_to_perms(self):
         for fmode, perm in self.perms_map.items():
             self.assertEqual(perm, from_disk.mode_to_perms(fmode))


+class TestDiskBackedContent(unittest.TestCase):
+    def test_with_data(self):
+        expected_content = model.Content(
+            length=42, status='visible', data=b'foo bar',
+            sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+        with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+            content = DiskBackedContent(
+                length=42, status='visible', path=fd.name,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+            fd.write(b'foo bar')
+            fd.seek(0)
+            content_with_data = content.with_data()
+
+        assert expected_content == content_with_data
+
+    def test_lazy_data(self):
+        with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+            fd.write(b'foo')
+            fd.seek(0)
+            content = DiskBackedContent(
+                length=42, status='visible', path=fd.name,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+            fd.write(b'bar')
+            fd.seek(0)
+            content_with_data = content.with_data()
+            fd.write(b'baz')
+            fd.seek(0)
+
+        assert content_with_data.data == b'bar'
+
+    def test_with_data_cannot_read(self):
+        with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+            content = DiskBackedContent(
+                length=42, status='visible', path=fd.name,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+        with pytest.raises(OSError):
+            content.with_data()
+
+    def test_missing_path(self):
+        with pytest.raises(TypeError):
+            DiskBackedContent(
+                length=42, status='visible',
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+        with pytest.raises(TypeError):
+            DiskBackedContent(
+                length=42, status='visible', path=None,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+
 class DataMixin:
     maxDiff = None  # type: ClassVar[Optional[int]]

     def setUp(self):
         self.tmpdir = tempfile.TemporaryDirectory(
             prefix='swh.model.from_disk'
         )
         self.tmpdir_name = os.fsencode(self.tmpdir.name)

         self.contents = {
             b'file': {
                 'data': b'42\n',
                 'sha1': hash_to_bytes(
                     '34973274ccef6ab4dfaaf86599792fa9c3fe4689'
                 ),
                 'sha256': hash_to_bytes(
                     '084c799cd551dd1d8d5c5f9a5d593b2e'
                     '931f5e36122ee5c793c1d08a19839cc0'
                 ),
                 'sha1_git': hash_to_bytes(
                     'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'),
                 'blake2s256': hash_to_bytes(
                     'd5fe1939576527e42cfd76a9455a2432'
                     'fe7f56669564577dd93c4280e76d661d'
                 ),
                 'length': 3,
                 'mode': 0o100644
             },
         }

         self.symlinks = {
             b'symlink': {
                 'data': b'target',
                 'blake2s256': hash_to_bytes(
                     '595d221b30fdd8e10e2fdf18376e688e'
                     '9f18d56fd9b6d1eb6a822f8c146c6da6'
                 ),
                 'sha1': hash_to_bytes(
                     '0e8a3ad980ec179856012b7eecf4327e99cd44cd'
                 ),
                 'sha1_git': hash_to_bytes(
                     '1de565933b05f74c75ff9a6520af5f9f8a5a2f1d'
                 ),
                 'sha256': hash_to_bytes(
                     '34a04005bcaf206eec990bd9637d9fdb'
                     '6725e0a0c0d4aebf003f17f4c956eb5c'
                 ),
                 'length': 6,
                 'perms': DentryPerms.symlink,
             }
         }

         self.specials = {
             b'fifo': os.mkfifo,
         }

         self.empty_content = {
             'data': b'',
             'length': 0,
             'blake2s256': hash_to_bytes(
                 '69217a3079908094e11121d042354a7c'
                 '1f55b6482ca1a51e1b250dfd1ed0eef9'
             ),
             'sha1': hash_to_bytes(
                 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
             ),
             'sha1_git': hash_to_bytes(
                 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
             ),
             'sha256': hash_to_bytes(
                 'e3b0c44298fc1c149afbf4c8996fb924'
                 '27ae41e4649b934ca495991b7852b855'
             ),
             'perms': DentryPerms.content,
         }

         self.empty_directory = {
             'id': hash_to_bytes(
                 '4b825dc642cb6eb9a060e54bf8d69288fbee4904'
             ),
             'entries': [],
         }

         # Generated with generate_testdata_from_disk
         self.tarball_contents = {
             b'': {
                 'entries': [{
                     'name': b'bar',
                     'perms': DentryPerms.directory,
                     'target': hash_to_bytes(
                         '3c1f578394f4623f74a0ba7fe761729f59fc6ec4'
                     ),
                     'type': 'dir',
                 }, {
                     'name': b'empty-folder',
                     'perms': DentryPerms.directory,
                     'target': hash_to_bytes(
                         '4b825dc642cb6eb9a060e54bf8d69288fbee4904'
                     ),
                     'type': 'dir',
                 }, {
                     'name': b'foo',
                     'perms': DentryPerms.directory,
                     'target': hash_to_bytes(
                         '2b41c40f0d1fbffcba12497db71fba83fcca96e5'
                     ),
                     'type': 'dir',
                 }, {
                     'name': b'link-to-another-quote',
                     'perms': DentryPerms.symlink,
                     'target': hash_to_bytes(
                         '7d5c08111e21c8a9f71540939998551683375fad'
                     ),
                     'type': 'file',
                 }, {
                     'name': b'link-to-binary',
                     'perms': DentryPerms.symlink,
                     'target': hash_to_bytes(
                         'e86b45e538d9b6888c969c89fbd22a85aa0e0366'
                     ),
                     'type': 'file',
                 }, {
                     'name': b'link-to-foo',
                     'perms': DentryPerms.symlink,
                     'target': hash_to_bytes(
                         '19102815663d23f8b75a47e7a01965dcdc96468c'
                     ),
                     'type': 'file',
                 }, {
                     'name': b'some-binary',
                     'perms': DentryPerms.executable_content,
                     'target': hash_to_bytes(
                         '68769579c3eaadbe555379b9c3538e6628bae1eb'
                     ),
                     'type': 'file',
                 }],
                 'id': hash_to_bytes(
                     'e8b0f1466af8608c8a3fb9879db172b887e80759'
                 ),
             },
             b'bar': {
                 'entries': [{
                     'name': b'barfoo',
                     'perms': DentryPerms.directory,
                     'target': hash_to_bytes(
                         'c3020f6bf135a38c6df3afeb5fb38232c5e07087'
                     ),
                     'type': 'dir',
                 }],
                 'id': hash_to_bytes(
                     '3c1f578394f4623f74a0ba7fe761729f59fc6ec4'
                 ),
             },
             b'bar/barfoo': {
                 'entries': [{
                     'name': b'another-quote.org',
                     'perms': DentryPerms.content,
                     'target': hash_to_bytes(
                         '133693b125bad2b4ac318535b84901ebb1f6b638'
                     ),
                     'type': 'file',
                 }],
                 'id': hash_to_bytes(
                     'c3020f6bf135a38c6df3afeb5fb38232c5e07087'
                 ),
             },
             b'bar/barfoo/another-quote.org': {
                 'blake2s256': hash_to_bytes(
                     'd26c1cad82d43df0bffa5e7be11a60e3'
                     '4adb85a218b433cbce5278b10b954fe8'
                 ),
                 'length': 72,
                 'perms': DentryPerms.content,
                 'sha1': hash_to_bytes(
                     '90a6138ba59915261e179948386aa1cc2aa9220a'
                 ),
                 'sha1_git': hash_to_bytes(
                     '133693b125bad2b4ac318535b84901ebb1f6b638'
                 ),
                 'sha256': hash_to_bytes(
                     '3db5ae168055bcd93a4d08285dc99ffe'
                     'e2883303b23fac5eab850273a8ea5546'
                 ),
             },
             b'empty-folder': {
                 'entries': [],
                 'id': hash_to_bytes(
                     '4b825dc642cb6eb9a060e54bf8d69288fbee4904'
                 ),
             },
             b'foo': {
                 'entries': [{
                     'name': b'barfoo',
                     'perms': DentryPerms.symlink,
                     'target': hash_to_bytes(
                         '8185dfb2c0c2c597d16f75a8a0c37668567c3d7e'
                     ),
                     'type': 'file',
                 }, {
                     'name': b'quotes.md',
                     'perms': DentryPerms.content,
                     'target': hash_to_bytes(
                         '7c4c57ba9ff496ad179b8f65b1d286edbda34c9a'
                     ),
                     'type': 'file',
                 }, {
                     'name': b'rel-link-to-barfoo',
                     'perms': DentryPerms.symlink,
                     'target': hash_to_bytes(
                         'acac326ddd63b0bc70840659d4ac43619484e69f'
                     ),
                     'type': 'file',
                 }],
                 'id': hash_to_bytes(
                     '2b41c40f0d1fbffcba12497db71fba83fcca96e5'
                 ),
             },
             b'foo/barfoo': {
                 'blake2s256': hash_to_bytes(
                     'e1252f2caa4a72653c4efd9af871b62b'
                     'f2abb7bb2f1b0e95969204bd8a70d4cd'
                 ),
                 'data': b'bar/barfoo',
                 'length': 10,
                 'perms': DentryPerms.symlink,
                 'sha1': hash_to_bytes(
                     '9057ee6d0162506e01c4d9d5459a7add1fedac37'
                 ),
                 'sha1_git': hash_to_bytes(
                     '8185dfb2c0c2c597d16f75a8a0c37668567c3d7e'
                 ),
                 'sha256': hash_to_bytes(
                     '29ad3f5725321b940332c78e403601af'
                     'ff61daea85e9c80b4a7063b6887ead68'
                 ),
             },
             b'foo/quotes.md': {
                 'blake2s256': hash_to_bytes(
                     'bf7ce4fe304378651ee6348d3e9336ed'
                     '5ad603d33e83c83ba4e14b46f9b8a80b'
                 ),
                 'length': 66,
                 'perms': DentryPerms.content,
                 'sha1': hash_to_bytes(
                     '1bf0bb721ac92c18a19b13c0eb3d741cbfadebfc'
                 ),
                 'sha1_git': hash_to_bytes(
                     '7c4c57ba9ff496ad179b8f65b1d286edbda34c9a'
                 ),
                 'sha256': hash_to_bytes(
                     'caca942aeda7b308859eb56f909ec96d'
                     '07a499491690c453f73b9800a93b1659'
                 ),
             },
             b'foo/rel-link-to-barfoo': {
                 'blake2s256': hash_to_bytes(
                     'd9c327421588a1cf61f316615005a2e9'
                     'c13ac3a4e96d43a24138d718fa0e30db'
                 ),
                 'data': b'../bar/barfoo',
                 'length': 13,
                 'perms': DentryPerms.symlink,
                 'sha1': hash_to_bytes(
                     'dc51221d308f3aeb2754db48391b85687c2869f4'
                 ),
                 'sha1_git': hash_to_bytes(
                     'acac326ddd63b0bc70840659d4ac43619484e69f'
                 ),
                 'sha256': hash_to_bytes(
                     '8007d20db2af40435f42ddef4b8ad76b'
                     '80adbec26b249fdf0473353f8d99df08'
                 ),
             },
             b'link-to-another-quote': {
                 'blake2s256': hash_to_bytes(
                     '2d0e73cea01ba949c1022dc10c8a43e6'
                     '6180639662e5dc2737b843382f7b1910'
                 ),
                 'data': b'bar/barfoo/another-quote.org',
                 'length': 28,
                 'perms': DentryPerms.symlink,
                 'sha1': hash_to_bytes(
                     'cbeed15e79599c90de7383f420fed7acb48ea171'
                 ),
                 'sha1_git': hash_to_bytes(
                     '7d5c08111e21c8a9f71540939998551683375fad'
                 ),
                 'sha256': hash_to_bytes(
                     'e6e17d0793aa750a0440eb9ad5b80b25'
                     '8076637ef0fb68f3ac2e59e4b9ac3ba6'
                 ),
             },
             b'link-to-binary': {
                 'blake2s256': hash_to_bytes(
                     '9ce18b1adecb33f891ca36664da676e1'
                     '2c772cc193778aac9a137b8dc5834b9b'
                 ),
                 'data': b'some-binary',
                 'length': 11,
                 'perms': DentryPerms.symlink,
                 'sha1': hash_to_bytes(
                     'd0248714948b3a48a25438232a6f99f0318f59f1'
                 ),
                 'sha1_git': hash_to_bytes(
                     'e86b45e538d9b6888c969c89fbd22a85aa0e0366'
                 ),
                 'sha256': hash_to_bytes(
                     '14126e97d83f7d261c5a6889cee73619'
                     '770ff09e40c5498685aba745be882eff'
                 ),
             },
             b'link-to-foo': {
                 'blake2s256': hash_to_bytes(
                     '08d6cad88075de8f192db097573d0e82'
                     '9411cd91eb6ec65e8fc16c017edfdb74'
                 ),
                 'data': b'foo',
                 'length': 3,
                 'perms': DentryPerms.symlink,
                 'sha1': hash_to_bytes(
                     '0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33'
                 ),
                 'sha1_git': hash_to_bytes(
                     '19102815663d23f8b75a47e7a01965dcdc96468c'
                 ),
                 'sha256': hash_to_bytes(
                     '2c26b46b68ffc68ff99b453c1d304134'
                     '13422d706483bfa0f98a5e886266e7ae'
                 ),
             },
             b'some-binary': {
                 'blake2s256': hash_to_bytes(
                     '922e0f7015035212495b090c27577357'
                     'a740ddd77b0b9e0cd23b5480c07a18c6'
                 ),
                 'length': 5,
                 'perms': DentryPerms.executable_content,
                 'sha1': hash_to_bytes(
                     '0bbc12d7f4a2a15b143da84617d95cb223c9b23c'
                 ),
                 'sha1_git': hash_to_bytes(
                     '68769579c3eaadbe555379b9c3538e6628bae1eb'
                 ),
                 'sha256': hash_to_bytes(
                     'bac650d34a7638bb0aeb5342646d24e3'
                     'b9ad6b44c9b383621faa482b990a367d'
                 ),
             },
         }

     def tearDown(self):
         self.tmpdir.cleanup()

-    def assertContentEqual(self, left, right, *, check_data=False,  # noqa
+    def assertContentEqual(self, left, right, *,  # noqa
                            check_path=False):
         if not isinstance(left, Content):
             raise ValueError('%s is not a Content' % left)
         if isinstance(right, Content):
             right = right.get_data()

+        # Compare dictionaries
+
         keys = DEFAULT_ALGORITHMS | {
             'length',
             'perms',
         }
-        if check_data:
-            keys |= {'data'}
         if check_path:
             keys |= {'path'}

         failed = []
         for key in keys:
             try:
                 lvalue = left.data[key]
                 if key == 'perms' and 'perms' not in right:
                     rvalue = from_disk.mode_to_perms(right['mode'])
                 else:
                     rvalue = right[key]
             except KeyError:
                 failed.append(key)
                 continue

             if lvalue != rvalue:
                 failed.append(key)

         if failed:
             raise self.failureException(
                 'Content mismatched:\n' +
                 '\n'.join(
                     'content[%s] = %r != %r' % (
                         key, left.data.get(key), right.get(key))
                     for key in failed
                 )
             )

     def assertDirectoryEqual(self, left, right):  # NoQA
         if not isinstance(left, Directory):
             raise ValueError('%s is not a Directory' % left)
         if isinstance(right, Directory):
             right = right.get_data()

         assert left.entries == right['entries']
+        assert left.hash == right['id']
+
+        assert left.to_model() == model.Directory.from_dict(right)

     def make_contents(self, directory):
         for filename, content in self.contents.items():
             path = os.path.join(directory, filename)
             with open(path, 'wb') as f:
                 f.write(content['data'])
             os.chmod(path, content['mode'])

     def make_symlinks(self, directory):
         for filename, symlink in self.symlinks.items():
             path = os.path.join(directory, filename)
             os.symlink(symlink['data'], path)

     def make_specials(self, directory):
         for filename, fn in self.specials.items():
             path = os.path.join(directory, filename)
             fn(path)

     def make_from_tarball(self, directory):
         tarball = os.path.join(TEST_DATA, 'dir-folders', 'sample-folder.tgz')

         with tarfile.open(tarball, 'r:gz') as f:
             f.extractall(os.fsdecode(directory))


 class TestContent(DataMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()

     def test_data_to_content(self):
         for filename, content in self.contents.items():
             conv_content = Content.from_bytes(mode=content['mode'],
                                               data=content['data'])
             self.assertContentEqual(conv_content, content)
             self.assertIn(hash_to_hex(conv_content.hash), repr(conv_content))


 class SymlinkToContent(DataMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
         self.make_symlinks(self.tmpdir_name)

     def test_symlink_to_content(self):
         for filename, symlink in self.symlinks.items():
             path = os.path.join(self.tmpdir_name, filename)
             perms = 0o120000
             conv_content = Content.from_symlink(path=path, mode=perms)
             self.assertContentEqual(conv_content, symlink)

+    def test_symlink_to_base_model(self):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            perms = 0o120000
+            model_content = \
+                Content.from_symlink(path=path, mode=perms).to_model()
+
+            right = symlink.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
+

 class FileToContent(DataMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
         self.make_contents(self.tmpdir_name)
         self.make_symlinks(self.tmpdir_name)
         self.make_specials(self.tmpdir_name)

     def test_symlink_to_content(self):
-        # Check whether loading the data works
-        for data in [True, False]:
-            for filename, symlink in self.symlinks.items():
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, symlink, check_data=data)
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, symlink)

     def test_file_to_content(self):
-        for data in [True, False]:
-            for filename, content in self.contents.items():
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, content, check_data=data)
+        for filename, content in self.contents.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, content)

     def test_special_to_content(self):
-        for data in [True, False]:
-            for filename in self.specials:
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, self.empty_content)
+        for filename in self.specials:
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, self.empty_content)

-            for path in ['/dev/null', '/dev/zero']:
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, self.empty_content)
+        for path in ['/dev/null', '/dev/zero']:
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, self.empty_content)
+
+    def test_symlink_to_content_model(self):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            model_content = Content.from_file(path=path).to_model()
+
+            right = symlink.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
+
+    def test_file_to_content_model(self):
+        for filename, content in self.contents.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            model_content = Content.from_file(path=path).to_model()
+
+            right = content.copy()
+            for key in ('perms', 'mode'):
+                right.pop(key, None)
+            assert model_content.with_data() == model.Content.from_dict(right)
+
+            right['path'] = path
+            del right['data']
+            assert model_content == DiskBackedContent.from_dict(right)
+
+    def test_special_to_content_model(self):
+        for filename in self.specials:
+            path = os.path.join(self.tmpdir_name, filename)
+            model_content = Content.from_file(path=path).to_model()
+
+            right = self.empty_content.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
+
+        for path in ['/dev/null', '/dev/zero']:
+            model_content = Content.from_file(path=path).to_model()
+
+            right = self.empty_content.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)

     def test_symlink_max_length(self):
         for max_content_length in [4, 10]:
             for filename, symlink in self.symlinks.items():
                 path = os.path.join(self.tmpdir_name, filename)
-                content = Content.from_file(path=path, data=True)
+                content = Content.from_file(path=path)
                 if content.data['length'] > max_content_length:
                     with pytest.raises(Exception, match='too large'):
                         Content.from_file(
-                            path=path, data=True,
+                            path=path,
                             max_content_length=max_content_length)
                 else:
                     limited_content = Content.from_file(
-                        path=path, data=True,
+                        path=path,
                         max_content_length=max_content_length)
                     assert content == limited_content

     def test_file_max_length(self):
         for max_content_length in [2, 4]:
             for filename, content in self.contents.items():
                 path = os.path.join(self.tmpdir_name, filename)
-                content = Content.from_file(path=path, data=True)
+                content = Content.from_file(path=path)
                 limited_content = Content.from_file(
-                    path=path, data=True,
+                    path=path,
                     max_content_length=max_content_length)
                 assert content.data['length'] == limited_content.data['length']
                 assert content.data['status'] == 'visible'
                 if content.data['length'] > max_content_length:
                     assert limited_content.data['status'] == 'absent'
                     assert limited_content.data['reason'] \
                         == 'Content too large'
                 else:
                     assert limited_content.data['status'] == 'visible'
-                    assert limited_content.data['data'] == content.data['data']

     def test_special_file_max_length(self):
         for max_content_length in [None, 0, 1]:
             for filename in self.specials:
                 path = os.path.join(self.tmpdir_name, filename)
-                content = Content.from_file(path=path, data=True)
+                content = Content.from_file(path=path)
                 limited_content = Content.from_file(
-                    path=path, data=True,
+                    path=path,
                     max_content_length=max_content_length)
                 assert limited_content == content

     def test_file_to_content_with_path(self):
         for filename, content in self.contents.items():
             content_w_path = content.copy()
             path = os.path.join(self.tmpdir_name, filename)
             content_w_path['path'] = path
-            conv_content = Content.from_file(path=path, save_path=True)
+            conv_content = Content.from_file(path=path)
             self.assertContentEqual(conv_content, content_w_path,
                                     check_path=True)


 @pytest.mark.fs
 class DirectoryToObjects(DataMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
         contents = os.path.join(self.tmpdir_name, b'contents')
         os.mkdir(contents)
         self.make_contents(contents)
         symlinks = os.path.join(self.tmpdir_name, b'symlinks')
         os.mkdir(symlinks)
         self.make_symlinks(symlinks)
         specials = os.path.join(self.tmpdir_name, b'specials')
         os.mkdir(specials)
         self.make_specials(specials)
         empties = os.path.join(self.tmpdir_name, b'empty1', b'empty2')
         os.makedirs(empties)

     def test_directory_to_objects(self):
         directory = Directory.from_disk(path=self.tmpdir_name)

         for name, value in self.contents.items():
             self.assertContentEqual(directory[b'contents/' + name], value)

         for name, value in self.symlinks.items():
             self.assertContentEqual(directory[b'symlinks/' + name], value)

         for name in self.specials:
             self.assertContentEqual(
                 directory[b'specials/' + name],
                 self.empty_content,
             )

         self.assertEqual(
             directory[b'empty1/empty2'].get_data(),
             self.empty_directory,
         )

         # Raise on non existent file
         with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
             directory[b'empty1/nonexistent']

         # Raise on non existent directory
         with self.assertRaisesRegex(KeyError, "b'nonexistentdir'"):
             directory[b'nonexistentdir/file']

         objs = directory.collect()

         self.assertCountEqual(['content', 'directory'], objs)

         self.assertEqual(len(objs['directory']), 6)
         self.assertEqual(len(objs['content']),
                          len(self.contents)
                          + len(self.symlinks)
                          + 1)

     def test_directory_to_objects_ignore_empty(self):
         directory = Directory.from_disk(
             path=self.tmpdir_name,
             dir_filter=from_disk.ignore_empty_directories
         )

         for name, value in self.contents.items():
             self.assertContentEqual(directory[b'contents/' + name], value)

         for name, value in self.symlinks.items():
             self.assertContentEqual(directory[b'symlinks/' + name], value)

         for name in self.specials:
             self.assertContentEqual(
                 directory[b'specials/' + name],
                 self.empty_content,
             )

         # empty directories have been ignored recursively
         with self.assertRaisesRegex(KeyError, "b'empty1'"):
             directory[b'empty1']
         with self.assertRaisesRegex(KeyError, "b'empty1'"):
             directory[b'empty1/empty2']

         objs = directory.collect()

         self.assertCountEqual(['content', 'directory'], objs)

         self.assertEqual(len(objs['directory']), 4)
         self.assertEqual(len(objs['content']),
                          len(self.contents)
                          + len(self.symlinks)
                          + 1)

     def test_directory_to_objects_ignore_name(self):
         directory = Directory.from_disk(
             path=self.tmpdir_name,
             dir_filter=from_disk.ignore_named_directories([b'symlinks'])
         )
         for name, value in self.contents.items():
             self.assertContentEqual(directory[b'contents/' + name], value)

         for name in self.specials:
             self.assertContentEqual(
                 directory[b'specials/' + name],
                 self.empty_content,
             )

         self.assertEqual(
             directory[b'empty1/empty2'].get_data(),
             self.empty_directory,
         )

         with self.assertRaisesRegex(KeyError, "b'symlinks'"):
             directory[b'symlinks']

         objs = directory.collect()

         self.assertCountEqual(['content', 'directory'], objs)

         self.assertEqual(len(objs['directory']), 5)
         self.assertEqual(len(objs['content']),
                          len(self.contents)
                          + 1)

     def test_directory_to_objects_ignore_name_case(self):
         directory = Directory.from_disk(
             path=self.tmpdir_name,
             dir_filter=from_disk.ignore_named_directories([b'symLiNks'],
                                                           case_sensitive=False)
         )
         for name, value in self.contents.items():
             self.assertContentEqual(directory[b'contents/' + name], value)

         for name in self.specials:
             self.assertContentEqual(
                 directory[b'specials/' + name],
                 self.empty_content,
             )

         self.assertEqual(
             directory[b'empty1/empty2'].get_data(),
             self.empty_directory,
         )

         with self.assertRaisesRegex(KeyError, "b'symlinks'"):
             directory[b'symlinks']

         objs = directory.collect()

         self.assertCountEqual(['content', 'directory'], objs)

         self.assertEqual(len(objs['directory']), 5)
         self.assertEqual(len(objs['content']),
                          len(self.contents)
                          + 1)

     def test_directory_entry_order(self):
         with tempfile.TemporaryDirectory() as dirname:
             dirname = os.fsencode(dirname)
             open(os.path.join(dirname, b'foo.'), 'a')
             open(os.path.join(dirname, b'foo0'), 'a')
             os.mkdir(os.path.join(dirname, b'foo'))

             directory = Directory.from_disk(path=dirname)

         assert [entry['name'] for entry in directory.entries] \
             == [b'foo.', b'foo', b'foo0']


 @pytest.mark.fs
 class TarballTest(DataMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
         self.make_from_tarball(self.tmpdir_name)

     def test_contents_match(self):
         directory = Directory.from_disk(
             path=os.path.join(self.tmpdir_name, b'sample-folder')
         )

-        for name, data in self.tarball_contents.items():
+        for name, expected in self.tarball_contents.items():
             obj = directory[name]
             if isinstance(obj, Content):
-                self.assertContentEqual(obj, data)
+                self.assertContentEqual(obj, expected)
             elif isinstance(obj, Directory):
-                self.assertDirectoryEqual(obj, data)
+                self.assertDirectoryEqual(obj, expected)
             else:
                 raise self.failureException('Unknown type for %s' % obj)


 class DirectoryManipulation(DataMixin, unittest.TestCase):
     def test_directory_access_nested(self):
         d = Directory()
         d[b'a'] = Directory()
         d[b'a/b'] = Directory()

         self.assertEqual(d[b'a/b'].get_data(), self.empty_directory)

     def test_directory_del_nested(self):
         d = Directory()
         d[b'a'] = Directory()
         d[b'a/b'] = Directory()

         with self.assertRaisesRegex(KeyError, "b'c'"):
             del d[b'a/b/c']

         with self.assertRaisesRegex(KeyError, "b'level2'"):
             del d[b'a/level2/c']

         del d[b'a/b']

         self.assertEqual(d[b'a'].get_data(), self.empty_directory)

     def test_directory_access_self(self):
         d = Directory()
         self.assertIs(d, d[b''])
         self.assertIs(d, d[b'/'])
         self.assertIs(d, d[b'//'])

     def test_directory_access_wrong_type(self):
         d = Directory()
         with self.assertRaisesRegex(ValueError, 'bytes from Directory'):
             d['foo']
         with self.assertRaisesRegex(ValueError, 'bytes from Directory'):
             d[42]

     def test_directory_repr(self):
         entries = [b'a', b'b', b'c']
         d = Directory()
         for entry in entries:
             d[entry] = Directory()

         r = repr(d)
         self.assertIn(hash_to_hex(d.hash), r)

         for entry in entries:
             self.assertIn(str(entry), r)

     def test_directory_set_wrong_type_name(self):
         d = Directory()
         with self.assertRaisesRegex(ValueError, 'bytes Directory entry'):
             d['foo'] = Directory()
         with self.assertRaisesRegex(ValueError, 'bytes Directory entry'):
             d[42] = Directory()

     def test_directory_set_nul_in_name(self):
         d = Directory()

         with self.assertRaisesRegex(ValueError, 'nul bytes'):
             d[b'\x00\x01'] = Directory()

     def test_directory_set_empty_name(self):
         d = Directory()
         with self.assertRaisesRegex(ValueError, 'must have a name'):
             d[b''] = Directory()
         with self.assertRaisesRegex(ValueError, 'must have a name'):
             d[b'/'] = Directory()

     def test_directory_set_wrong_type(self):
         d = Directory()
         with self.assertRaisesRegex(ValueError, 'Content or Directory'):
             d[b'entry'] = object()

     def test_directory_del_wrong_type(self):
         d = Directory()
         with self.assertRaisesRegex(ValueError, 'bytes Directory entry'):
             del d['foo']
         with self.assertRaisesRegex(ValueError, 'bytes Directory entry'):
             del d[42]
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
index e905725..82f3dc9 100644
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -1,121 +1,138 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import copy

 from hypothesis import given
+import pytest

 from swh.model.model import Content, Directory, Revision, Release, Snapshot
+from swh.model.model import MissingData
 from swh.model.hashutil import hash_to_bytes
 from swh.model.hypothesis_strategies import objects, origins, origin_visits
 from swh.model.identifiers import (
     directory_identifier, revision_identifier, release_identifier,
     snapshot_identifier
 )
 from swh.model.tests.test_identifiers import (
     directory_example, revision_example, release_example, snapshot_example
 )


 @given(objects())
 def test_todict_inverse_fromdict(objtype_and_obj):
     (obj_type, obj) = objtype_and_obj

     if obj_type in ('origin', 'origin_visit'):
         return

     obj_as_dict = obj.to_dict()
     obj_as_dict_copy = copy.deepcopy(obj_as_dict)

     # Check the composition of to_dict and from_dict is the identity
     assert obj == type(obj).from_dict(obj_as_dict)

     # Check from_dict() does not change the input dict
     assert obj_as_dict == obj_as_dict_copy

     # Check the composition of from_dict and to_dict is the identity
     assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict()


 @given(origins())
 def test_todict_origins(origin):
     obj = origin.to_dict()

     assert 'type' not in obj
     assert type(origin)(url=origin.url) == type(origin).from_dict(obj)


 @given(origin_visits())
 def test_todict_origin_visits(origin_visit):
     obj = origin_visit.to_dict()

     assert origin_visit == type(origin_visit).from_dict(obj)


 def test_content_get_hash():
     hashes = dict(
         sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
     c = Content(length=42, status='visible', **hashes)
     for (hash_name, hash_) in hashes.items():
         assert c.get_hash(hash_name) == hash_


 def test_content_hashes():
     hashes = dict(
         sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
     c = Content(length=42, status='visible', **hashes)
     assert c.hashes() == hashes


+def test_content_data():
+    c = Content(
+        length=42, status='visible', data=b'foo',
+        sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+    assert c.with_data() == c
+
+
+def test_content_data_missing():
+    c = Content(
+        length=42, status='visible',
+        sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+    with pytest.raises(MissingData):
+        c.with_data()
+
+
 def test_directory_model_id_computation():
     dir_dict = dict(directory_example)
     del dir_dict['id']

     dir_id = hash_to_bytes(directory_identifier(dir_dict))
     for dir_model in [Directory(**dir_dict), Directory.from_dict(dir_dict)]:
         assert dir_model.id == dir_id


 def test_revision_model_id_computation():
     rev_dict = dict(revision_example)
     del rev_dict['id']

     rev_id = hash_to_bytes(revision_identifier(rev_dict))
     for rev_model in [Revision(**rev_dict), Revision.from_dict(rev_dict)]:
         assert rev_model.id == rev_id


 def test_revision_model_id_computation_with_no_date():
     """We can have a revision with date set to None."""
     rev_dict = dict(revision_example)
     rev_dict['date'] = None
     rev_dict['committer_date'] = None
     del rev_dict['id']

     rev_id = hash_to_bytes(revision_identifier(rev_dict))
     for rev_model in [Revision(**rev_dict), Revision.from_dict(rev_dict)]:
         assert rev_model.date is None
         assert rev_model.committer_date is None
         assert rev_model.id == rev_id


 def test_release_model_id_computation():
     rel_dict = dict(release_example)
     del rel_dict['id']

     rel_id = hash_to_bytes(release_identifier(rel_dict))
     for rel_model in [Release(**rel_dict), Release.from_dict(rel_dict)]:
         assert rel_model.id == hash_to_bytes(rel_id)


 def test_snapshot_model_id_computation():
     snp_dict = dict(snapshot_example)
     del snp_dict['id']

     snp_id = hash_to_bytes(snapshot_identifier(snp_dict))
     for snp_model in [Snapshot(**snp_dict), Snapshot.from_dict(snp_dict)]:
         assert snp_model.id == snp_id
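Putting both halves together for the skip path, a sketch assuming a hypothetical oversized file:

from swh.model import model
from swh.model.from_disk import Content

# Hypothetical path to a file bigger than the 1 MiB cap used here.
node = Content.from_file(path=b'/tmp/big.bin',
                         max_content_length=1024 * 1024)
assert node.data['status'] == 'absent'
assert node.data['reason'] == 'Content too large'

# to_model() drops the on-disk path and yields a model.SkippedContent,
# while visible contents yield a lazy DiskBackedContent instead.
assert isinstance(node.to_model(), model.SkippedContent)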