diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -24,6 +24,9 @@
 [mypy-django.*]
 ignore_missing_imports = True
 
+[mypy-iso8601.*]
+ignore_missing_imports = True
+
 [mypy-msgpack.*]
 ignore_missing_imports = True
 
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@
 deprecated
 typing-extensions
 mypy_extensions
+iso8601
diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -0,0 +1,794 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2020  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+import hashlib
+import json
+import os
+import re
+import sys
+from typing import Any, Dict, Optional
+from urllib.parse import unquote, urlparse
+
+import iso8601
+
+from swh.core.db import BaseDb
+from swh.model.hashutil import hash_to_hex
+from swh.model.identifiers import SWHID, parse_swhid
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    RawExtrinsicMetadata,
+)
+from swh.storage import get_storage
+
+CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"
+
+ATOM_NS = "http://www.w3.org/2005/Atom"
+ATOM_KEYS = ["id", "author", "external_identifier", "title"]
+# Columns selected from the revision table; main() zips each row with this list.
+REVISION_COLS = ["id", "date", "committer_date", "type", "message", "metadata"]
+# Columns selected from the deposit db; handle_deposit_row() zips rows with it.
+DEPOSIT_COLS = [
+    "deposit.id",
+    "deposit.external_id",
+    "deposit.swh_id_context",
+    "deposit_request.metadata",
+    "deposit_request.date",
+    "deposit_client.provider_url",
+    "deposit_collection.name",
+    "auth_user.username",
+]
+# Format identifiers passed to load_metadata() for the migrated metadata.
+OLD_DEPOSIT_FORMAT = (
+    "sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces"  # before February 2018
+)
+NEW_DEPOSIT_FORMAT = "sword-v2-atom-codemeta-v2-in-json"  # after February 2018
+GNU_FORMAT = "gnu-tree-json"  # NOTE(review): not referenced elsewhere in this file
+NIXGUIX_FORMAT = "nixguix-sources-json"
+NPM_FORMAT = "replicate-npm-package-json"
+ORIGINAL_ARTIFACT_FORMAT = "original-artifacts-json"
+PYPI_FORMAT = "pypi-project-json"
+
+FETCHER = MetadataFetcher(
+    name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+AUTHORITIES = {
+    "npmjs": MetadataAuthority(
+        type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}
+    ),
+    "pypi": MetadataAuthority(
+        type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={}
+    ),
+    "gnu": MetadataAuthority(
+        type=MetadataAuthorityType.FORGE, url="https://ftp.gnu.org/", metadata={}
+    ),
+    "swh": MetadataAuthority(
+        type=MetadataAuthorityType.REGISTRY,
+        url="https://softwareheritage.org/",
+        metadata={},
+    ),
+}
+# Matches deposit revision messages, capturing client, deposit id and collection.
+deposit_revision_message_re = re.compile(
+    b"(?P<client>[a-z]*): "
+    b"Deposit (?P<deposit_id>[0-9]+) in collection (?P<collection>[a-z]+).*"
+)
+
+
+def pypi_project_from_filename(filename):
+    """Return the project name parsed from a PyPI sdist filename."""
+    name_part = r"^(?P<project_name>[a-zA-Z0-9_.-]+)"
+    version_part = r"-[0-9.]+([a-z]+[0-9]+)?(\.dev[0-9]+)?"
+    extension_part = r"\.(tar\.gz|zip)$"
+    parsed = re.match(name_part + version_part + extension_part, filename)
+    assert parsed, filename
+    return parsed.group("project_name")
+
+
+def cran_package_from_url(filename):
+    """Return the package name parsed from a CRAN source tarball URL."""
+    # NB: despite its name, the argument is matched as a full URL.
+    base_url = r"^https://cran\.r-project\.org/src/contrib/"
+    package_and_version = r"(?P<package_name>[a-zA-Z0-9.]+)_[0-9.-]+(\.tar\.gz)?$"
+    parsed = re.match(base_url + package_and_version, filename)
+    assert parsed, filename
+    return parsed.group("package_name")
+
+
+def npm_package_from_source_url(package_source_url):
+    """Return the (URL-unquoted) package name from an npm registry tarball URL."""
+    # Dots are escaped so they only match literal dots in the host and extension.
+    pattern = r"^https://registry\.npmjs\.org/(?P<package_name>.*)/-/[^/]+\.tgz$"
+    match = re.match(pattern, package_source_url)
+    assert match, package_source_url
+    return unquote(match.group("package_name"))
+
+
+def remove_atom_codemeta_metadata_with_xmlns(metadata):
+    """Delete, in place, the Atom keys, xmlns markers and "codemeta:*" keys."""
+    doomed = ATOM_KEYS + ["@xmlns", "@xmlns:codemeta"]
+    for name in [k for k in metadata if k.startswith("codemeta:") or k in doomed]:
+        del metadata[name]
+
+
+def remove_atom_codemeta_metadata_without_xmlns(metadata):
+    prefixes = ("{%s}" % ATOM_NS, "{%s}" % CODEMETA_NS)  # "{namespace}key" style
+    for name in [k for k in metadata if k.startswith(prefixes)]:
+        del metadata[name]  # in place: the caller's dict loses these keys
+
+
+_origins = set()  # SHA1 digests of known origin URLs, filled by the script entry
+
+
+def assert_origin_exists(storage, origin):
+    """Assert that origin is in the preloaded digest set or known to storage."""
+    # Cheap local lookup first; the storage query only runs on a cache miss.
+    cached = hashlib.sha1(origin.encode()).digest() in _origins
+    assert cached or storage.origin_get([origin])[0] is not None, origin
+
+
+def load_metadata(
+    storage,
+    revision_id,
+    discovery_date: datetime.datetime,
+    metadata: Dict[str, Any],
+    format: str,
+    authority: MetadataAuthority,
+    origin: Optional[str],
+    dry_run: bool,
+):
+    """Build a revision's RawExtrinsicMetadata; store it unless dry_run is set."""
+    entry = RawExtrinsicMetadata(
+        type=MetadataTargetType.REVISION,
+        id=SWHID(object_type="revision", object_id=hash_to_hex(revision_id)),
+        discovery_date=discovery_date,
+        authority=authority,
+        fetcher=FETCHER,
+        format=format,
+        metadata=json.dumps(metadata).encode(),
+        origin=origin,
+    )
+    if dry_run:
+        return  # the object was still built, which exercises its construction
+    storage.raw_extrinsic_metadata_add([entry])
+
+def handle_deposit_row(
+    row,
+    discovery_date: Optional[datetime.datetime],
+    origin,
+    storage,
+    deposit_cur,
+    dry_run: bool,
+):
+    """Load the metadata the deposit db holds for the deposit behind a revision.
+
+    Returns (origin, discovery_date), filled in from the deposit db when None.
+    """
+    parsed_message = deposit_revision_message_re.match(row["message"])
+    assert parsed_message is not None, row["message"]
+    deposit_id = int(parsed_message.group("deposit_id"))
+    collection = parsed_message.group("collection").decode()
+    client_name = parsed_message.group("client").decode()
+
+    deposit_cur.execute(
+        f"SELECT {', '.join(DEPOSIT_COLS)} FROM deposit "
+        f"INNER JOIN deposit_collection "
+        f" ON (deposit.collection_id=deposit_collection.id) "
+        f"INNER JOIN deposit_client ON (deposit.client_id=deposit_client.user_ptr_id) "
+        f"INNER JOIN auth_user ON (deposit.client_id=auth_user.id) "
+        f"INNER JOIN deposit_request ON (deposit.id=deposit_request.deposit_id) "
+        f"WHERE deposit.id = %s",
+        (deposit_id,),
+    )
+
+    provider_urls = set()
+    swhids = set()
+    metadata_entries = []
+    dates = set()
+    external_identifiers = set()
+    for deposit_request_row in deposit_cur:
+        deposit_request = dict(zip(DEPOSIT_COLS, deposit_request_row))
+
+        # Sanity checks to make sure we selected the right deposit
+        assert deposit_request["deposit.id"] == deposit_id
+        assert deposit_request["deposit_collection.name"] == collection, deposit_request
+        if client_name != "":
+            # Sometimes it's missing from the commit message
+            assert deposit_request["auth_user.username"] == client_name
+
+        date = deposit_request["deposit_request.date"]
+        external_identifiers.add(deposit_request["deposit.external_id"])
+        swhids.add(deposit_request["deposit.swh_id_context"])
+        dates.add(date)
+        provider_urls.add(deposit_request["deposit_client.provider_url"])
+        metadata = deposit_request["deposit_request.metadata"]
+        if metadata is not None:
+            json.dumps(metadata).encode()  # check it's valid
+            if "@xmlns" in metadata:
+                assert metadata["@xmlns"] == ATOM_NS
+                assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
+                format = NEW_DEPOSIT_FORMAT
+            else:
+                assert "{http://www.w3.org/2005/Atom}id" in metadata
+                assert "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata
+                format = OLD_DEPOSIT_FORMAT
+            metadata_entries.append((date, format, metadata))
+
+    # Checked before max(): with no matching rows, dates is empty and max() raises.
+    assert len(metadata_entries) >= 1, deposit_id
+    if discovery_date is None:
+        discovery_date = max(dates)
+    assert len(provider_urls) == 1, f"expected 1 provider url, got {provider_urls}"
+    (provider_url,) = provider_urls
+    assert len(swhids) == 1
+    (swhid,) = swhids
+    assert (
+        len(external_identifiers) == 1
+    ), f"expected 1 external identifier, got {external_identifiers}"
+    (external_identifier,) = external_identifiers
+
+    # compute the origin from the external_identifier if we don't have one
+    if origin is None:
+        origin = f"{provider_url.strip('/')}/{external_identifier}"
+
+        # explicit list of past mistakes, which shouldn't happen again:
+        if origin == "https://hal.archives-ouvertes.fr/hal-01588781":
+            # deposit id 75
+            origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781"
+        elif origin == "https://hal.archives-ouvertes.fr/hal-01588782":
+            # deposit id 76
+            origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782"
+        elif origin == "https://hal.archives-ouvertes.fr/hal-01592430":
+            # deposit id 143
+            origin = "https://hal-preprod.archives-ouvertes.fr/hal-01592430"
+
+        assert_origin_exists(storage, origin)
+
+    # check the origin (never None at this point) matches the one in the deposit db
+    swhid_origin = parse_swhid(swhid).metadata["origin"]
+    # explicit list of past mistakes, which shouldn't happen again:
+    exceptions = [
+        (
+            # deposit id 229
+            "https://hal.archives-ouvertes.fr/hal-01243573",
+            "https://hal-test.archives-ouvertes.fr/hal-01243573",
+        ),
+        (
+            # deposit id 199
+            "https://hal.archives-ouvertes.fr/hal-01243065",
+            "https://hal-test.archives-ouvertes.fr/hal-01243065",
+        ),
+        (
+            # deposit id 164
+            "https://hal.archives-ouvertes.fr/hal-01593855",
+            "https://hal-preprod.archives-ouvertes.fr/hal-01593855",
+        ),
+    ]
+    if (origin, swhid_origin) not in exceptions:
+        assert origin == swhid_origin, (
+            f"the origin we guessed from the deposit db or revision ({origin}) "
+            f"doesn't match the one in the deposit db's SWHID ({swhid})"
+        )
+
+    authority = MetadataAuthority(
+        type=MetadataAuthorityType.DEPOSIT_CLIENT, url=provider_url, metadata={},
+    )
+
+    for (date, format, metadata) in metadata_entries:
+        load_metadata(
+            storage,
+            row["id"],
+            date,
+            metadata,
+            format,
+            authority=authority,
+            origin=origin,
+            dry_run=dry_run,
+        )
+
+    return (origin, discovery_date)
+
+
+def handle_row(row: Dict[str, Any], storage, deposit_cur, dry_run: bool):
+    """Migrate the extrinsic metadata stored in one revision row.
+
+    Rows without metadata are skipped. Otherwise every key of row["metadata"] is
+    loaded through load_metadata or deleted in place; leftovers fail an assert.
+    """
+    type_ = row["type"]
+    discovery_date = row["date"] or row["committer_date"]
+
+    metadata = row["metadata"]
+
+    if metadata is None:
+        return
+
+    if type_ == "dsc":
+        origin = None  # TODO: I can't find how to get it reliably
+
+        # TODO: the debian loader stores the changelog date as author/committer date;
+        # the visit date would be better, but cannot be found without the origin
+
+        if "extrinsic" in metadata:
+            extrinsic_files = metadata["extrinsic"]["raw"]["files"]
+            for artifact_entry in metadata["original_artifact"]:
+                extrinsic_file = extrinsic_files[artifact_entry["filename"]]
+                for key in ("sha256",):
+                    assert artifact_entry["checksums"][key] == extrinsic_file[key]
+                    artifact_entry["url"] = extrinsic_file["uri"]
+            del metadata["extrinsic"]
+
+    elif type_ == "tar":
+        provider = metadata.get("extrinsic", {}).get("provider")
+        if provider is not None:
+            discovery_date = iso8601.parse_date(metadata["extrinsic"]["when"])
+
+            # New versions of the loaders write the provider; use it.
+            if provider.startswith("https://replicate.npmjs.com/"):
+                # npm loader format 1
+
+                parsed_url = urlparse(provider)
+                assert re.match("^/[^/]+/?$", parsed_url.path), parsed_url
+                package_name = unquote(parsed_url.path.strip("/"))
+                origin = "https://www.npmjs.com/package/" + package_name
+                assert_origin_exists(storage, origin)
+
+                load_metadata(
+                    storage,
+                    row["id"],
+                    discovery_date,
+                    metadata["extrinsic"]["raw"],
+                    NPM_FORMAT,
+                    authority=AUTHORITIES["npmjs"],
+                    origin=origin,
+                    dry_run=dry_run,
+                )
+                del metadata["extrinsic"]
+
+            elif provider.startswith("https://pypi.org/"):
+                # pypi loader format 1
+
+                match = re.match(
+                    "https://pypi.org/pypi/(?P<project_name>.*)/json", provider
+                )
+                assert match, f"unexpected provider URL format: {provider}"
+                project_name = match.group("project_name")
+                origin = f"https://pypi.org/project/{project_name}/"
+                assert_origin_exists(storage, origin)
+
+                load_metadata(
+                    storage,
+                    row["id"],
+                    discovery_date,
+                    metadata["extrinsic"]["raw"],
+                    PYPI_FORMAT,
+                    authority=AUTHORITIES["pypi"],
+                    origin=origin,
+                    dry_run=dry_run,
+                )
+                del metadata["extrinsic"]
+
+            elif provider.startswith("https://cran.r-project.org/"):
+                # cran loader
+
+                provider = metadata["extrinsic"]["provider"]
+                if provider.startswith("https://cran.r-project.org/package="):
+                    origin = metadata["extrinsic"]["provider"]
+                else:
+                    package_name = cran_package_from_url(provider)
+                    origin = f"https://cran.r-project.org/package={package_name}"
+                assert_origin_exists(storage, origin)
+
+                raw_extrinsic_metadata = metadata["extrinsic"]["raw"]
+
+                # this is actually intrinsic, ignore it
+                del raw_extrinsic_metadata["version"]
+
+                # Copy the URL to the original_artifacts metadata
+                assert len(metadata["original_artifact"]) == 1
+                assert "url" not in metadata["original_artifact"][0]
+                metadata["original_artifact"][0]["url"] = raw_extrinsic_metadata["url"]
+                del raw_extrinsic_metadata["url"]
+
+                assert (
+                    raw_extrinsic_metadata == {}
+                ), f"Unexpected metadata keys: {list(raw_extrinsic_metadata)}"
+
+                del metadata["extrinsic"]
+
+            elif provider.startswith("https://nix-community.github.io/nixpkgs-swh/"):
+                # nixguix loader
+                origin = provider
+                assert_origin_exists(storage, origin)
+
+                authority = MetadataAuthority(
+                    type=MetadataAuthorityType.FORGE, url=provider, metadata={},
+                )
+                assert row["date"] is None  # the nixguix loader does not write dates
+
+                load_metadata(
+                    storage,
+                    row["id"],
+                    discovery_date,
+                    metadata["extrinsic"]["raw"],
+                    NIXGUIX_FORMAT,
+                    authority=authority,
+                    origin=origin,
+                    dry_run=dry_run,
+                )
+                del metadata["extrinsic"]
+
+            elif provider.startswith("https://ftp.gnu.org/"):
+                # archive loader format 1
+
+                origin = provider
+                assert_origin_exists(storage, origin)
+
+                assert len(metadata["original_artifact"]) == 1
+                metadata["original_artifact"][0]["url"] = metadata["extrinsic"]["raw"][
+                    "url"
+                ]
+
+                # Remove duplicate keys of original_artifacts
+                for key in ("url", "time", "length", "version", "filename"):
+                    del metadata["extrinsic"]["raw"][key]
+
+                assert metadata["extrinsic"]["raw"] == {}
+                del metadata["extrinsic"]
+
+            elif provider.startswith("https://deposit.softwareheritage.org/"):
+                origin = metadata["extrinsic"]["raw"]["origin"]["url"]
+                assert_origin_exists(storage, origin)
+
+                if "@xmlns" in metadata:
+                    assert metadata["@xmlns"] == ATOM_NS
+                    assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
+                    assert "intrinsic" not in metadata
+                    assert "extra_headers" not in metadata
+
+                    # deposit loader format 1
+                    # in this case, the metadata seems to be both directly in metadata
+                    # and in metadata["extrinsic"]["raw"]["metadata"]
+
+                    (origin, discovery_date) = handle_deposit_row(
+                        row, discovery_date, origin, storage, deposit_cur, dry_run
+                    )
+
+                    remove_atom_codemeta_metadata_with_xmlns(metadata)
+                    if "client" in metadata:
+                        del metadata["client"]
+                    del metadata["extrinsic"]
+                else:
+                    # deposit loader format 2
+                    actual_metadata = metadata["extrinsic"]["raw"]["origin_metadata"][
+                        "metadata"
+                    ]
+                    if "@xmlns" in actual_metadata:
+                        assert actual_metadata["@xmlns"] == ATOM_NS
+                        assert actual_metadata["@xmlns:codemeta"] in (
+                            CODEMETA_NS,
+                            [CODEMETA_NS],
+                        )
+                    else:
+                        assert "{http://www.w3.org/2005/Atom}id" in actual_metadata
+                        assert (
+                            "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author"
+                            in actual_metadata
+                        )
+
+                    (origin, discovery_date) = handle_deposit_row(
+                        row, discovery_date, origin, storage, deposit_cur, dry_run
+                    )
+
+                    del metadata["extrinsic"]
+            else:
+                assert False, f"unknown provider {provider}"
+
+        # Older versions don't write the provider; use heuristics instead.
+        elif (
+            metadata.get("package_source", {})
+            .get("url", "")
+            .startswith("https://registry.npmjs.org/")
+        ):
+            # npm loader format 2
+
+            package_source_url = metadata["package_source"]["url"]
+            package_name = npm_package_from_source_url(package_source_url)
+            origin = "https://www.npmjs.com/package/" + package_name
+            assert_origin_exists(storage, origin)
+
+            load_metadata(
+                storage,
+                row["id"],
+                discovery_date,
+                metadata["package"],
+                NPM_FORMAT,
+                authority=AUTHORITIES["npmjs"],
+                origin=origin,
+                dry_run=dry_run,
+            )
+            del metadata["package"]
+
+            assert "original_artifact" not in metadata
+
+            # rebuild "original_artifact"-like metadata from what "package_source" has
+            package_source_metadata = metadata["package_source"]
+            keep_keys = {"blake2s256", "filename", "sha1", "sha256", "url"}
+            discard_keys = {
+                "date",  # is equal to the revision date
+                "name",  # was loaded above
+                "version",  # same
+            }
+            assert (
+                set(package_source_metadata) == keep_keys | discard_keys
+            ), package_source_metadata
+
+            # will be loaded below
+            metadata["original_artifact"] = [
+                {
+                    "filename": package_source_metadata["filename"],
+                    "checksums": {
+                        "sha1": package_source_metadata["sha1"],
+                        "sha256": package_source_metadata["sha256"],
+                        "blake2s256": package_source_metadata["blake2s256"],
+                    },
+                    "url": package_source_metadata["url"],
+                }
+            ]
+            del metadata["package_source"]
+
+        elif "@xmlns" in metadata:
+            assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
+            assert "intrinsic" not in metadata
+            assert "extra_headers" not in metadata
+
+            # deposit loader format 3 and 4
+
+            origin = None  # TODO
+            discovery_date = None  # TODO
+
+            (origin, discovery_date) = handle_deposit_row(
+                row, discovery_date, origin, storage, deposit_cur, dry_run
+            )
+            remove_atom_codemeta_metadata_with_xmlns(metadata)
+            if "client" in metadata:
+                del metadata["client"]  # found in the deposit db
+            if "committer" in metadata:
+                del metadata["committer"]  # found on the revision object
+
+        elif "{http://www.w3.org/2005/Atom}id" in metadata:
+            assert "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata
+            assert "intrinsic" not in metadata
+            assert "extra_headers" not in metadata
+
+            # deposit loader format 5
+
+            origin = None
+            discovery_date = None  # TODO
+
+            (origin, discovery_date) = handle_deposit_row(
+                row, discovery_date, origin, storage, deposit_cur, dry_run
+            )
+            remove_atom_codemeta_metadata_without_xmlns(metadata)
+
+        elif (
+            isinstance(metadata.get("original_artifact"), dict)
+            and metadata["original_artifact"]["url"].startswith(
+                "https://files.pythonhosted.org/"
+            )
+        ) or (
+            isinstance(metadata.get("original_artifact"), list)
+            and len(metadata.get("original_artifact")) == 1
+            and metadata["original_artifact"][0]
+            .get("url", "")
+            .startswith("https://files.pythonhosted.org/")
+        ):
+            if isinstance(metadata.get("original_artifact"), dict):
+                metadata["original_artifact"] = [metadata["original_artifact"]]
+
+            assert len(metadata["original_artifact"]) == 1
+
+            # it's tempting here to do this:
+            #   project_name = pypi_project_from_filename(
+            #       metadata["original_artifact"][0]["filename"]
+            #   )
+            #   origin = f"https://pypi.org/project/{project_name}/"
+            #   assert_origin_exists(storage, origin)
+            # but unfortunately, the filename is user-provided, and doesn't
+            # necessarily match the package name on pypi.
+
+            origin = None
+
+            if "project" in metadata:
+                # pypi loader format 2
+
+                # same reason as above, we can't do this:
+                #   if metadata["project"]:
+                #       assert metadata["project"]["name"] == project_name
+
+                load_metadata(
+                    storage,
+                    row["id"],
+                    discovery_date,
+                    metadata["project"],
+                    PYPI_FORMAT,
+                    authority=AUTHORITIES["pypi"],
+                    origin=origin,
+                    dry_run=dry_run,
+                )
+                del metadata["project"]
+            else:
+                assert set(metadata) == {"original_artifact"}, set(metadata)
+                # pypi loader format 3
+                pass  # nothing to do, there's no metadata
+
+        elif row["message"] == b"synthetic revision message":
+            assert isinstance(metadata["original_artifact"], list), metadata
+            assert not any("url" in d for d in metadata["original_artifact"])
+
+            # archive loader format 2
+
+            origin = None
+
+        elif deposit_revision_message_re.match(row["message"]):
+            # deposit without metadata in the revision
+
+            assert set(metadata) == {"original_artifact"}, metadata
+
+            origin = None  # TODO
+            discovery_date = None
+
+            (origin, discovery_date) = handle_deposit_row(
+                row, discovery_date, origin, storage, deposit_cur, dry_run
+            )
+        else:
+            assert False, f"Unable to detect type of metadata for row: {row}"
+
+    # Ignore common intrinsic metadata keys
+    for key in ("intrinsic", "extra_headers"):
+        if key in metadata:
+            del metadata[key]
+
+    # Ignore loader-specific intrinsic metadata keys
+    if type_ == "hg":
+        del metadata["node"]
+    elif type_ == "dsc":
+        if "package_info" in metadata:
+            del metadata["package_info"]
+
+    if "original_artifact" in metadata:
+        for original_artifact in metadata["original_artifact"]:
+            # Rename keys to the expected format of original-artifacts-json.
+            rename_keys = [
+                ("name", "filename"),  # eg. from old Debian loader
+                ("size", "length"),  # eg. from old PyPI loader
+            ]
+            for (old_name, new_name) in rename_keys:
+                if old_name in original_artifact:
+                    assert new_name not in original_artifact
+                    original_artifact[new_name] = original_artifact.pop(old_name)
+
+            # Move checksums to their own subdict, as original-artifacts-json expects.
+            if "sha1" in original_artifact:
+                assert "checksums" not in original_artifact
+                original_artifact["checksums"] = {}
+                for key in ("sha1", "sha256", "sha1_git", "blake2s256"):
+                    if key in original_artifact:
+                        original_artifact["checksums"][key] = original_artifact.pop(key)
+
+            allowed_keys = {
+                "checksums",
+                "date",
+                "filename",
+                "length",
+                "url",
+                "archive_type",
+            }
+            assert set(original_artifact) <= allowed_keys, set(original_artifact)
+
+        load_metadata(
+            storage,
+            row["id"],
+            discovery_date,
+            metadata["original_artifact"],
+            ORIGINAL_ARTIFACT_FORMAT,
+            authority=AUTHORITIES["swh"],
+            origin=origin,  # NOTE(review): only bound on the "dsc"/"tar" paths
+            dry_run=dry_run,
+        )
+        del metadata["original_artifact"]
+
+    assert metadata == {}, (
+        f"remaining metadata keys for {row['id'].hex()} (type: {row['type']}): "
+        f"{metadata}"
+    )
+
+
+def create_fetchers(db):
+    """Insert this script's FETCHER into metadata_fetcher unless it conflicts."""
+    # NOTE(review): no commit is issued here -- confirm the connection handles it.
+    insert_fetcher = """
+            INSERT INTO metadata_fetcher (name, version, metadata)
+            VALUES (%s, %s, %s)
+            ON CONFLICT DO NOTHING
+            """
+    with db.cursor() as cur:
+        cur.execute(insert_fetcher, (FETCHER.name, FETCHER.version, FETCHER.metadata))
+
+
+def main(storage_dbconn, storage_url, deposit_dbconn, first_id, dry_run):
+    """Migrate revision metadata page by page, starting after first_id."""
+    storage_db = BaseDb.connect(storage_dbconn)
+    deposit_db = BaseDb.connect(deposit_dbconn)
+    storage = get_storage("remote", url=storage_url)
+
+    # Authorities are deliberately not created here: the loaders are presumably
+    # already running and have created them. This also makes sure this script
+    # doesn't accidentally create authorities that differ from the loaders' ones.
+    if not dry_run:
+        create_fetchers(storage_db)
+
+    # Each page holds up to 1000 revisions whose id follows the last one handled.
+    page_query = (
+        f"SELECT {', '.join(REVISION_COLS)} FROM revision "
+        f"WHERE id > %s AND metadata IS NOT NULL ORDER BY id LIMIT 1000"
+    )
+    migrated = 0
+    last_id = first_id
+    with storage_db.cursor() as read_cur, deposit_db.cursor() as deposit_cur:
+        while True:
+            read_cur.execute(page_query, (last_id,))
+
+            page_size = 0
+            for fields in read_cur:
+                revision = dict(zip(REVISION_COLS, fields))
+                handle_row(revision, storage, deposit_cur, dry_run)
+                page_size += 1
+
+            if not page_size:
+                break  # the last query returned no row: everything was handled
+
+            last_id = revision["id"]
+            migrated += page_size
+
+            # Progress estimate from the id's first four bytes (assumes uniform ids).
+            percents = int.from_bytes(last_id[:4], byteorder="big") * 100 / (1 << 32)
+            print(
+                f"Migrated {migrated/1000000.:.2f}M rows "
+                f"(~{percents:.1f}%, last revision: {last_id.hex()})"
+            )
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 4:
+        (_, storage_dbconn, storage_url, deposit_dbconn) = sys.argv
+        first_id = "00" * 20
+    elif len(sys.argv) == 5:
+        (_, storage_dbconn, storage_url, deposit_dbconn, first_id) = sys.argv
+    else:
+        print(
+            f"Syntax: {sys.argv[0]} <storage_dbconn> <storage_url> "
+            f"<deposit_dbconn> [<first id>]"
+        )
+        sys.exit(1)  # sys.exit, not the site-provided exit(), which may be absent
+
+    if os.path.isfile("./origins.txt"):
+        # You can generate this file with:
+        # psql service=swh-replica \
+        #   -c "\copy (select digest(url, 'sha1') from origin) to stdout" \
+        #   | pv -l > origins.txt
+        print("Loading origins...")
+        with open("./origins.txt") as fd:
+            for line in fd:
+                digest = line.strip()[3:]  # drop 3-char prefix, presumably "\\x"
+                _origins.add(bytes.fromhex(digest))
+        print("Done loading origins.")
+    # NOTE: dry_run is hard-coded to True, so nothing is written to storage.
+    main(storage_dbconn, storage_url, deposit_dbconn, bytes.fromhex(first_id), True)
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_cran.py b/swh/storage/tests/migrate_extrinsic_metadata/test_cran.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_cran.py
@@ -0,0 +1,221 @@
+# Copyright (C) 2020  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    Origin,
+    RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import handle_row, cran_package_from_url
+
+
+FETCHER = MetadataFetcher(  # fetcher expected on every RawExtrinsicMetadata in this file
+    name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+SWH_AUTHORITY = MetadataAuthority(  # authority expected on "original-artifacts-json" objects
+    type=MetadataAuthorityType.REGISTRY,
+    url="https://softwareheritage.org/",
+    metadata={},
+)
+
+
+def test_cran_package_from_url():
+    """Check the package name extracted from CRAN tarball URLs."""
+    expected_by_url = {
+        "https://cran.r-project.org/src/contrib/shapeR_0.1-5.tar.gz": "shapeR",
+        "https://cran.r-project.org/src/contrib/hot.deck_1.1.tar.gz": "hot.deck",
+    }
+    for url, package in expected_by_url.items():
+        assert cran_package_from_url(url) == package
+
+
+def test_cran():  # CRAN revision that has both a revision date and extrinsic metadata
+    source_original_artifacts = [
+        {
+            "length": 170623,
+            "filename": "ExtremeRisks_0.0.3.tar.gz",
+            "checksums": {
+                "sha1": "f2f19fc0f24b66b5ea9413366c632f3c229f7f3f",
+                "sha256": "6f232556313019809dde3554149a1399bb1901a366b4965af49dc007d01945c9",
+            },
+        }
+    ]
+    dest_original_artifacts = [
+        {
+            "length": 170623,
+            "filename": "ExtremeRisks_0.0.3.tar.gz",
+            "checksums": {
+                "sha1": "f2f19fc0f24b66b5ea9413366c632f3c229f7f3f",
+                "sha256": "6f232556313019809dde3554149a1399bb1901a366b4965af49dc007d01945c9",
+            },
+            "url": "https://cran.r-project.org/src/contrib/ExtremeRisks_0.0.3.tar.gz",
+        }
+    ]
+    # Revision row handed to handle_row; its keys are the same as REVISION_COLS.
+    row = {
+        "id": b"\x00\x03a\xaa3\x84,\xbd\xea_\xa6\xe7}\xb6\x96\xb97\xeb\xd2i",
+        "date": datetime.datetime(2020, 5, 5, 0, 0, tzinfo=datetime.timezone.utc,),
+        "committer_date": datetime.datetime(
+            2020, 5, 5, 0, 0, tzinfo=datetime.timezone.utc,
+        ),
+        "type": "tar",
+        "message": b"0.0.3",
+        "metadata": {
+            "extrinsic": {
+                "raw": {
+                    "url": "https://cran.r-project.org/src/contrib/ExtremeRisks_0.0.3.tar.gz",
+                    "version": "0.0.3",
+                },
+                "when": "2020-05-07T15:27:38.652281+00:00",
+                "provider": "https://cran.r-project.org/package=ExtremeRisks",
+            },
+            "intrinsic": {
+                "raw": {
+                    "URL": "mypage.unibocconi.it/simonepadoan/",
+                    "Date": "2020-05-05",
+                    "Title": "Extreme Risk Measures",
+                    "Author": "Simone Padoan [cre, aut],\n  Gilles Stupfler [aut]",
+                    # ...
+                    "Date/Publication": "2020-05-07 10:20:02 UTC",
+                },
+                "tool": "DESCRIPTION",
+            },
+            "original_artifact": source_original_artifacts,
+        },
+    }
+    # Same URL as the "provider" field of the extrinsic metadata above.
+    origin_url = "https://cran.r-project.org/package=ExtremeRisks"
+
+    storage = Mock()
+    # origin_get must be called with exactly this origin and returns a matching Origin.
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)  # copy, as in sibling tests
+    # Expect one origin lookup, then one object: artifacts plus "url", dated by the extrinsic "when".
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:000361aa33842cbdea5fa6e77db696b937ebd269"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 5, 7, 15, 27, 38, 652281, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(dest_original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
+
+
+def test_cran_without_revision_date():
+    """Tests a CRAN revision with a date in the metadata but not as revision date"""
+    source_original_artifacts = [
+        {
+            "length": 8018,
+            "filename": "gofgamma_1.0.tar.gz",
+            "checksums": {
+                "sha1": "58f2993140f9e9e1a136554f0af0174a252f2c7b",
+                "sha256": "55408f004642b5043bb01de831a7e7a0b9f24a30cb0151e70c2d37abdc508d03",
+            },
+        }
+    ]
+    dest_original_artifacts = [
+        {
+            "length": 8018,
+            "filename": "gofgamma_1.0.tar.gz",
+            "checksums": {
+                "sha1": "58f2993140f9e9e1a136554f0af0174a252f2c7b",
+                "sha256": "55408f004642b5043bb01de831a7e7a0b9f24a30cb0151e70c2d37abdc508d03",
+            },
+            "url": "https://cran.r-project.org/src/contrib/gofgamma_1.0.tar.gz",
+        }
+    ]
+    # "date" and "committer_date" are None here; the only timestamp is the extrinsic "when".
+    row = {
+        "id": b'\x00\x00\xd4\xef^\x16a"\xae\xe6\x86*\xd3\x8a\x18\xceS\x86\xcc>',
+        "date": None,
+        "committer_date": None,
+        "type": "tar",
+        "message": b"1.0",
+        "metadata": {
+            "extrinsic": {
+                "raw": {
+                    "url": "https://cran.r-project.org/src/contrib/gofgamma_1.0.tar.gz",
+                    "version": "1.0",
+                },
+                "when": "2020-04-30T11:01:57.832481+00:00",
+                "provider": "https://cran.r-project.org/package=gofgamma",
+            },
+            "intrinsic": {
+                "raw": {
+                    "Type": "Package",
+                    "Title": "Goodness-of-Fit Tests for the Gamma Distribution",
+                    "Author": "Lucas Butsch [aut],\n  Bruno Ebner [aut, cre],\n  Steffen Betsch [aut]",
+                    # ...
+                },
+                "tool": "DESCRIPTION",
+            },
+            "original_artifact": source_original_artifacts,
+        },
+    }
+    # Same URL as the "provider" field of the extrinsic metadata above.
+    origin_url = "https://cran.r-project.org/package=gofgamma"
+
+    storage = Mock()
+    # origin_get must be called with exactly this origin and returns a matching Origin.
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    # Expected discovery_date is the extrinsic "when"; the artifact gains the tarball "url".
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0000d4ef5e166122aee6862ad38a18ce5386cc3e"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 4, 30, 11, 1, 57, 832481, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(dest_original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
@@ -0,0 +1,273 @@
+# Copyright (C) 2020  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import handle_row
+
+
+FETCHER = MetadataFetcher(  # fetcher expected on every RawExtrinsicMetadata in this file
+    name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+SWH_AUTHORITY = MetadataAuthority(  # authority expected on "original-artifacts-json" objects
+    type=MetadataAuthorityType.REGISTRY,
+    url="https://softwareheritage.org/",
+    metadata={},
+)
+
+
+def test_debian_with_extrinsic():  # dsc revision whose metadata has an "extrinsic" section
+    dest_original_artifacts = [
+        {
+            "length": 2936,
+            "filename": "kalgebra_19.12.1-1.dsc",
+            "checksums": {
+                "sha1": "f869e9f1155b1ee6d28ae3b40060570152a358cd",
+                "sha256": "75f77150aefdaa4bcf8bc5b1e9b8b90b5cb1651b76a068c5e58e5b83658d5d11",
+            },
+            "url": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.dsc",
+        },
+        {
+            "length": 1156408,
+            "filename": "kalgebra_19.12.1.orig.tar.xz",
+            "checksums": {
+                "sha1": "e496032962212983a5359aebadfe13c4026fd45c",
+                "sha256": "49d623186800eb8f6fbb91eb43fb14dff78e112624c9cda6b331d494d610b16a",
+            },
+            "url": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz",
+        },
+        {
+            "length": 10044,
+            "filename": "kalgebra_19.12.1-1.debian.tar.xz",
+            "checksums": {
+                "sha1": "b518bfc2ac708b40577c595bd539faa8b84572db",
+                "sha256": "1a30acd2699c3769da302f7a0c63a7d7b060f80925b38c8c43ce3bec92744d67",
+            },
+            "url": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.debian.tar.xz",
+        },
+        {
+            "length": 488,
+            "filename": "kalgebra_19.12.1.orig.tar.xz.asc",
+            "checksums": {
+                "sha1": "ff53a5c21c1aef2b9caa38a02fa3488f43df4c20",
+                "sha256": "a37e0b95bb1f16b19b0587bc5d3b99ba63a195d7f6335c4a359122ad96d682dd",
+            },
+            "url": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz.asc",
+        },
+    ]
+    # Input artifacts: the same entries without "url" (the URLs appear as "uri" under "files" below).
+    source_original_artifacts = [
+        {k: v for (k, v) in d.items() if k != "url"} for d in dest_original_artifacts
+    ]
+
+    row = {
+        "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+        "date": datetime.datetime(
+            2020, 1, 26, 22, 3, 24, tzinfo=datetime.timezone.utc,
+        ),
+        "date_offset": 60,
+        "type": "dsc",
+        "message": b"Synthetic revision for Debian source package kalgebra version 4:19.12.1-1",
+        "metadata": {
+            "extrinsic": {
+                "raw": {
+                    "id": 2718802,
+                    "name": "kalgebra",
+                    "files": {
+                        "kalgebra_19.12.1-1.dsc": {
+                            "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.dsc",
+                            "name": "kalgebra_19.12.1-1.dsc",
+                            "size": 2936,
+                            "md5sum": "fd28f604d4cc31a0a305543230f1622a",
+                            "sha256": "75f77150aefdaa4bcf8bc5b1e9b8b90b5cb1651b76a068c5e58e5b83658d5d11",
+                        },
+                        "kalgebra_19.12.1.orig.tar.xz": {
+                            "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz",
+                            "name": "kalgebra_19.12.1.orig.tar.xz",
+                            "size": 1156408,
+                            "md5sum": "34e09ed152da762d53101ea33634712b",
+                            "sha256": "49d623186800eb8f6fbb91eb43fb14dff78e112624c9cda6b331d494d610b16a",
+                        },
+                        "kalgebra_19.12.1-1.debian.tar.xz": {
+                            "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.debian.tar.xz",
+                            "name": "kalgebra_19.12.1-1.debian.tar.xz",
+                            "size": 10044,
+                            "md5sum": "4f639f36143898d97d044f273f038e58",
+                            "sha256": "1a30acd2699c3769da302f7a0c63a7d7b060f80925b38c8c43ce3bec92744d67",
+                        },
+                        "kalgebra_19.12.1.orig.tar.xz.asc": {
+                            "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz.asc",
+                            "name": "kalgebra_19.12.1.orig.tar.xz.asc",
+                            "size": 488,
+                            "md5sum": "3c29291e4e6f0c294de80feb8e9fce4c",
+                            "sha256": "a37e0b95bb1f16b19b0587bc5d3b99ba63a195d7f6335c4a359122ad96d682dd",
+                        },
+                    },
+                    "version": "4:19.12.1-1",
+                    "revision_id": None,
+                },
+                "when": "2020-01-27T19:32:03.925498+00:00",
+                "provider": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.dsc",
+            },
+            "intrinsic": {
+                "raw": {
+                    "name": "kalgebra",
+                    "version": "4:19.12.1-1",
+                    # ...
+                },
+                "tool": "dsc",
+            },
+            "original_artifact": source_original_artifacts,
+        },
+    }
+    # No origin_get side effect is configured: the expected calls below contain no origin lookup.
+    storage = Mock()
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    # Expected discovery_date equals the row's "date", not the extrinsic "when"; no origin is set.
+    assert storage.method_calls == [
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0000036c311ef33a281b05688f6eadcfc0943aee"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 1, 26, 22, 3, 24, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(dest_original_artifacts).encode(),
+                ),
+            ]
+        )
+    ]
+
+
+def test_debian_without_extrinsic():  # dsc revision with "package_info" and no "extrinsic" section
+    source_original_artifacts = [
+        {
+            "name": "pymongo_1.10-1.dsc",
+            "sha1": "81877c1ae4406c2519b9cc9c4557cf6b0775a241",
+            "length": 99,
+            "sha256": "40269a73f38ee4c2f9cc021f1d5d091cc59ca6e778c339684b7be030e29e282f",
+            "sha1_git": "0ac7bdb8e4d10926c5d3e51baa2be7bb29a3966b",
+        },
+        {
+            "name": "pymongo_1.10.orig.tar.gz",
+            "sha1": "4f4c97641b86ac8f21396281bd1a7369236693c3",
+            "length": 99,
+            "sha256": "0b6bffb310782ffaeb3916c75790742ec5830c63a758fc711cd1f557eb5a4b5f",
+            "sha1_git": "19ef0adda8868520d1ef9d4164b3ace4df1d62ad",
+        },
+        {
+            "name": "pymongo_1.10-1.debian.tar.gz",
+            "sha1": "fbf378296613c8d55e043aec98896b3e50a94971",
+            "length": 99,
+            "sha256": "3970cc70fe3ba6499a9c56ba4b4c6c3782f56433d0d17d72b7a0e2ceae31b513",
+            "sha1_git": "2eea9904806050a8fda95edd5d4fa60d29c1fdec",
+        },
+    ]
+    # Expected output: "name" becomes "filename" and the hashes move under "checksums"; no "url".
+    dest_original_artifacts = [
+        {
+            "length": 99,
+            "filename": "pymongo_1.10-1.dsc",
+            "checksums": {
+                "sha1": "81877c1ae4406c2519b9cc9c4557cf6b0775a241",
+                "sha256": "40269a73f38ee4c2f9cc021f1d5d091cc59ca6e778c339684b7be030e29e282f",
+                "sha1_git": "0ac7bdb8e4d10926c5d3e51baa2be7bb29a3966b",
+            },
+        },
+        {
+            "length": 99,
+            "filename": "pymongo_1.10.orig.tar.gz",
+            "checksums": {
+                "sha1": "4f4c97641b86ac8f21396281bd1a7369236693c3",
+                "sha256": "0b6bffb310782ffaeb3916c75790742ec5830c63a758fc711cd1f557eb5a4b5f",
+                "sha1_git": "19ef0adda8868520d1ef9d4164b3ace4df1d62ad",
+            },
+        },
+        {
+            "length": 99,
+            "filename": "pymongo_1.10-1.debian.tar.gz",
+            "checksums": {
+                "sha1": "fbf378296613c8d55e043aec98896b3e50a94971",
+                "sha256": "3970cc70fe3ba6499a9c56ba4b4c6c3782f56433d0d17d72b7a0e2ceae31b513",
+                "sha1_git": "2eea9904806050a8fda95edd5d4fa60d29c1fdec",
+            },
+        },
+    ]
+
+    row = {
+        "id": b"\x00\x00\x01\xc2\x8c\x8f\xca\x01\xb9\x04\xde\x92\xa2d\n\x86l\xe0<\xb7",
+        "date": datetime.datetime(
+            2011, 3, 31, 20, 17, 41, tzinfo=datetime.timezone.utc
+        ),
+        "date_offset": 0,
+        "type": "dsc",
+        "message": b"Synthetic revision for Debian source package pymongo version 1.10-1",
+        "metadata": {
+            "package_info": {
+                "name": "pymongo",
+                "version": "1.10-1",
+                "changelog": {
+                    # ...
+                },
+                "maintainers": [
+                    {"name": "Federico Ceratto", "email": "federico.ceratto@gmail.com"},
+                    {"name": "Janos Guljas", "email": "janos@resenje.org"},
+                ],
+                "pgp_signature": {
+                    "date": "2011-03-31T21:02:44+00:00",
+                    "keyid": "2BABC6254E66E7B8450AC3E1E6AA90171392B174",
+                    "person": {"name": "David Paleino", "email": "d.paleino@gmail.com"},
+                },
+                "lister_metadata": {"id": 244296, "lister": "snapshot.debian.org"},
+            },
+            "original_artifact": source_original_artifacts,
+        },
+    }
+    # No origin_get side effect is configured: the expected calls below contain no origin lookup.
+    storage = Mock()
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    # Expected discovery_date equals the row's "date"; the expected object carries no origin.
+    assert storage.method_calls == [
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:000001c28c8fca01b904de92a2640a866ce03cb7"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2011, 3, 31, 20, 17, 41, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(dest_original_artifacts).encode(),
+                ),
+            ]
+        )
+    ]
+    ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
@@ -0,0 +1,1111 @@
+# Copyright (C) 2020  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock, MagicMock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    Origin,
+    RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import (
+    DEPOSIT_COLS,
+    handle_row,
+    cran_package_from_url,
+)
+
+
+FETCHER = MetadataFetcher(  # fetcher expected on the RawExtrinsicMetadata asserted in this file
+    name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+SWH_AUTHORITY = MetadataAuthority(  # authority expected on "original-artifacts-json" objects
+    type=MetadataAuthorityType.REGISTRY,
+    url="https://softwareheritage.org/",
+    metadata={},
+)
+SWH_DEPOSIT_AUTHORITY = MetadataAuthority(  # deposit client with provider_url www.softwareheritage.org
+    type=MetadataAuthorityType.DEPOSIT_CLIENT,
+    url="https://www.softwareheritage.org",
+    metadata={},
+)
+SWH_FORGE_AUTHORITY = MetadataAuthority(  # NOTE(review): same values as SWH_DEPOSIT_AUTHORITY -- confirm intended
+    type=MetadataAuthorityType.DEPOSIT_CLIENT,
+    url="https://www.softwareheritage.org",
+    metadata={},
+)
+HAL_AUTHORITY = MetadataAuthority(  # deposit client with provider_url hal.archives-ouvertes.fr
+    type=MetadataAuthorityType.DEPOSIT_CLIENT,
+    url="https://hal.archives-ouvertes.fr/",
+    metadata={},
+)
+INTEL_AUTHORITY = MetadataAuthority(  # deposit client with url software.intel.com
+    type=MetadataAuthorityType.DEPOSIT_CLIENT,
+    url="https://software.intel.com",
+    metadata={},
+)
+
+
+def get_mock_deposit_cur(row_dicts):
+    """Return a mock cursor whose first iteration yields DEPOSIT_COLS-ordered tuples."""
+    cursor = MagicMock()
+    cursor.__iter__.side_effect = [iter([tuple(d[c] for c in DEPOSIT_COLS) for d in row_dicts])]
+    return cursor
+
+
+def test_deposit_1():  # deposit by client "swh" whose metadata uses prefixed keys ("codemeta:url")
+    extrinsic_metadata = {
+        "title": "Je suis GPL",
+        "@xmlns": "http://www.w3.org/2005/Atom",
+        "client": "swh",
+        "codemeta:url": "https://forge.softwareheritage.org/source/jesuisgpl/",
+        "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+        "codemeta:author": {
+            "codemeta:name": "Stefano Zacchiroli",
+            "codemeta:jobTitle": "Maintainer",
+        },
+        "codemeta:license": {
+            "codemeta:url": "https://spdx.org/licenses/GPL-3.0-or-later.html",
+            "codemeta:name": "GNU General Public License v3.0 or later",
+        },
+        # ...
+    }
+    original_artifacts = [
+        {
+            "length": 80880,
+            "filename": "archive.zip",
+            "checksums": {
+                "sha1": "bad32a47a359e0e16ebdca2ad2dc6a771dac8f71",
+                "sha256": "182b7ee3b7b5b550e83d3bcfed029bb2f625ee760ebfe9557d5fd072bd4e22e4",
+            },
+        }
+    ]
+    # Revision row: the deposit metadata appears both under "origin_metadata" and at the top level.
+    row = {
+        "id": b"\x02#\x10\xdf\x16\xfd\x9eMO\x81\xfe6\xa1B\xe8-\xb9w\xc0\x1d",
+        "date": datetime.datetime(2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc),
+        "committer_date": datetime.datetime(
+            2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"swh: Deposit 467 in collection swh",
+        "metadata": {
+            "client": "swh",
+            "extrinsic": {
+                "raw": {
+                    "origin": {
+                        "url": "https://www.softwareheritage.org/check-deposit-2020-03-11T11:07:18.424476",
+                        "type": "deposit",
+                    },
+                    "branch_name": "master",
+                    "origin_metadata": {
+                        "tool": {
+                            "name": "swh-deposit",
+                            "version": "0.0.1",
+                            "configuration": {"sword_version": 2},
+                        },
+                        "metadata": extrinsic_metadata,
+                    },
+                },
+                "when": "2020-03-11T11:11:36.336283+00:00",
+                "provider": "https://deposit.softwareheritage.org/1/private/467/meta/",
+            },
+            "original_artifact": original_artifacts,
+            **extrinsic_metadata,
+        },
+    }
+    # Same URL as row["metadata"]["extrinsic"]["raw"]["origin"]["url"].
+    origin_url = (
+        "https://www.softwareheritage.org/check-deposit-2020-03-11T11:07:18.424476"
+    )
+    # Qualified SWHID whose anchor is the revision under test.
+    swhid = (
+        f"swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea"
+        f";origin={origin_url}"
+        f";visit=swh:1:snp:14433c19dbb03ad57c86b58b53a800d6a0e32dd3"
+        f";anchor=swh:1:rev:022310df16fd9e4d4f81fe36a142e82db977c01d"
+        f";path=/"
+    )
+    # Rows the mocked deposit cursor yields (keys are DEPOSIT_COLS); only the first has metadata.
+    deposit_rows = [
+        {
+            "deposit.id": 467,
+            "deposit.external_id": "check-deposit-2020-03-11T11:07:18.424476",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": extrinsic_metadata,
+            "deposit_request.date": datetime.datetime(
+                2020, 3, 11, 11, 7, 18, 688410, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://www.softwareheritage.org",
+            "deposit_collection.name": "swh",
+            "auth_user.username": "swh",
+        },
+        {
+            "deposit.id": 467,
+            "deposit.external_id": "check-deposit-2020-03-11T11:07:18.424476",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": None,
+            "deposit_request.date": datetime.datetime(
+                2020, 3, 11, 11, 7, 18, 669428, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://www.softwareheritage.org",
+            "deposit_collection.name": "swh",
+            "auth_user.username": "swh",
+        },
+    ]
+
+    storage = Mock()
+    # origin_get must be called with exactly this origin and returns a matching Origin.
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = get_mock_deposit_cur(deposit_rows)
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    # The deposit cursor must have been executed and iterated exactly once.
+    deposit_cur.execute.assert_called_once()
+    deposit_cur.__iter__.assert_called_once()
+    # Expect the deposit metadata (dated by its deposit_request.date), then the original artifacts.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:022310df16fd9e4d4f81fe36a142e82db977c01d"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 3, 11, 11, 7, 18, 688410, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_DEPOSIT_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="sword-v2-atom-codemeta-v2-in-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:022310df16fd9e4d4f81fe36a142e82db977c01d"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 3, 11, 11, 11, 36, 336283, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
+
+
+def test_deposit_2():
+    extrinsic_metadata = {
+        "{http://www.w3.org/2005/Atom}id": "hal-01243573",
+        "{http://www.w3.org/2005/Atom}author": {
+            "{http://www.w3.org/2005/Atom}name": "HAL",
+            "{http://www.w3.org/2005/Atom}email": "hal@ccsd.cnrs.fr",
+        },
+        "{http://www.w3.org/2005/Atom}client": "hal",
+        "{http://www.w3.org/2005/Atom}external_identifier": "hal-01243573",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url": "https://hal-test.archives-ouvertes.fr/hal-01243573",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "The assignment problem",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author": {
+            "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "Morane Gruenpeter"
+        },
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license": {
+            "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "GNU General Public License v3.0 or later"
+        },
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}version": 1,
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}identifier": "10.5281/zenodo.438684",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}dateCreated": "2017-11-16T14:54:23+01:00",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}description": "Project in OR: The assignment problem A java implementation for the assignment problem first release",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}codeRepository": "https://github.com/moranegg/AffectationRO",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}operatingSystem": "debian",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform": "outil",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}softwareVersion": "1.0.0",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}developmentStatus": "etat",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}applicationCategory": "info.info-ro",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage": "AMPL",
+    }
+    original_artifacts = [
+        {
+            "length": 208357,
+            "filename": "archive.zip",
+            "checksums": {
+                "sha1": "fa0aec08e8a44ea144dba7ce366c8b5d66c14453",
+                "sha256": "f53c05fe947e88ce83751a93bd522b1f88478ea2e7b984c07fc7a7c68128bf87",
+            },
+        }
+    ]
+
+    row = {
+        "id": b"\x01\x16\xca\xb7\x19d\xd5\x9c\x85p\xb4\xc5r\x9b(\xbd\xd6<\x9bF",
+        "date": datetime.datetime(
+            2018, 1, 17, 12, 54, 0, 723882, tzinfo=datetime.timezone.utc
+        ),
+        "committer_date": datetime.datetime(
+            2018, 1, 17, 12, 54, 0, 723882, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"hal: Deposit 82 in collection hal",
+        "metadata": {
+            "extrinsic": {
+                "raw": {
+                    "origin": {
+                        "url": "https://hal.archives-ouvertes.fr/hal-01243573",
+                        "type": "deposit",
+                    },
+                    "origin_metadata": {
+                        "tool": {
+                            "name": "swh-deposit",
+                            "version": "0.0.1",
+                            "configuration": {"sword_version": 2},
+                        },
+                        "metadata": extrinsic_metadata,
+                        "provider": {
+                            "metadata": {},
+                            "provider_url": "https://hal.archives-ouvertes.fr/",
+                            "provider_name": "hal",
+                            "provider_type": "deposit_client",
+                        },
+                    },
+                },
+                "when": "2020-05-15T14:27:21.462270+00:00",
+                "provider": "https://deposit.softwareheritage.org/1/private/82/meta/",
+            },
+            "original_artifact": original_artifacts,
+        },
+    }
+
+    swhid = (
+        "swh:1:dir:e04b2a7b8a8838da0693e9fd992a10d6fd211b50"
+        ";origin=https://hal.archives-ouvertes.fr/hal-01243573"
+        ";visit=swh:1:snp:abc9ae594245a740235b6c039f044352a5f723ec"
+        ";anchor=swh:1:rev:0116cab71964d59c8570b4c5729b28bdd63c9b46"
+        ";path=/"
+    )
+
+    deposit_rows = [
+        {
+            "deposit.id": 82,
+            "deposit.external_id": "hal-01243573",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": None,
+            "deposit_request.date": datetime.datetime(
+                2018, 1, 17, 12, 54, 1, 533972, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+            "deposit_collection.name": "hal",
+            "auth_user.username": "hal",
+        },
+        {
+            "deposit.id": 82,
+            "deposit.external_id": "hal-01243573",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": extrinsic_metadata,
+            "deposit_request.date": datetime.datetime(
+                2018, 1, 17, 12, 54, 0, 413748, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+            "deposit_collection.name": "hal",
+            "auth_user.username": "hal",
+        },
+    ]
+
+    origin_url = "https://hal.archives-ouvertes.fr/hal-01243573"
+
+    storage = Mock()
+
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = get_mock_deposit_cur(deposit_rows)
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+    deposit_cur.execute.assert_called_once()
+    deposit_cur.__iter__.assert_called_once()
+
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0116cab71964d59c8570b4c5729b28bdd63c9b46"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2018, 1, 17, 12, 54, 0, 413748, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=HAL_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0116cab71964d59c8570b4c5729b28bdd63c9b46"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 5, 15, 14, 27, 21, 462270, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
+
+
+def test_deposit_3_and_wrong_external_id_in_metadata():  # deposit 268: the metadata's external_identifier disagrees with deposit.external_id
+    extrinsic_metadata = {
+        "title": "VTune Perf tool",
+        "@xmlns": "http://www.w3.org/2005/Atom",
+        "client": "swh",
+        "codemeta:url": "https://software.intel.com/en-us/vtune",
+        "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+        "codemeta:author": {
+            "codemeta:name": "VTune developer",
+            "codemeta:jobTitle": "Software Engineer",
+        },
+        "codemeta:license": {
+            "codemeta:url": "https://spdx.org/licenses/GPL-2.0.html",
+            "codemeta:name": "GNU General Public License v2.0",
+        },
+        "codemeta:version": "4.3.1.33",
+        "external_identifier": "vtune-perf-tool",  # not the deposit.external_id used in deposit_rows below
+        "codemeta:dateCreated": "2019-05-14",
+        "codemeta:description": "Modified version of Linux Perf tool which is used by Intel VTune Amplifier",
+        "codemeta:runtimePlatform": "GNU/Linux",
+        "codemeta:developmentStatus": "stable",
+        "codemeta:programmingLanguage": "C",
+    }
+    source_original_artifacts = [  # artifact as stored in the revision: "name" plus flat checksum keys
+        {
+            "name": "archive.zip",
+            "sha1": "07251dbb1d904d143fd7da9935701f17670d4d9b",
+            "length": 4350528,
+            "sha256": "1f7d111ac79e468002f3edf4b7b2487538d41f6bea362d49b2eb08a537efafb6",
+            "sha1_git": "e2d894efcaad4ff36f09eda3b3c0096416b03429",
+            "blake2s256": "e2c08b82efbc361fbb2d28aa8352668cd71217f165f63de16b61ed61ace7509d",
+            "archive_type": "zip",
+        }
+    ]
+    dest_original_artifacts = [  # same artifact as expected in the output: "filename" plus nested "checksums"
+        {
+            "length": 4350528,
+            "archive_type": "zip",
+            "filename": "archive.zip",
+            "checksums": {
+                "sha1": "07251dbb1d904d143fd7da9935701f17670d4d9b",
+                "sha256": "1f7d111ac79e468002f3edf4b7b2487538d41f6bea362d49b2eb08a537efafb6",
+                "sha1_git": "e2d894efcaad4ff36f09eda3b3c0096416b03429",
+                "blake2s256": "e2c08b82efbc361fbb2d28aa8352668cd71217f165f63de16b61ed61ace7509d",
+            },
+        }
+    ]
+    # Revision row: the deposit metadata sits at the top level of "metadata", next to "original_artifact".
+    row = {
+        "id": b"\t5`S\xc4\x9a\xd0\xf9\xe6.Q\xc2\x9d>a|y\x11@\xdf",
+        "date": datetime.datetime(2019, 5, 14, 0, 0, tzinfo=datetime.timezone.utc),
+        "committer_date": datetime.datetime(
+            2019, 5, 14, 0, 0, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"intel: Deposit 268 in collection intel",
+        "metadata": {
+            **extrinsic_metadata,
+            "original_artifact": source_original_artifacts,
+        },
+    }
+    # SWHID context recorded for the deposit; its anchor is the hex form of row["id"].
+    swhid = (
+        "swh:1:dir:527c8e4a67d391f2bf1bbc86dd94af5d5cfc8ef7"
+        ";origin=https://software.intel.com/f80482de-90a8-4c32-bce4-6f6918d492ff"
+        ";visit=swh:1:snp:49d60943d9c061da1aba6266a811412f9db8de2e"
+        ";anchor=swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df"
+        ";path=/"
+    )
+    deposit_rows = [  # four requests for deposit 268, two of which carry metadata
+        {
+            "deposit.id": 268,
+            "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": extrinsic_metadata,
+            "deposit_request.date": datetime.datetime(
+                2019, 5, 14, 7, 49, 36, 775072, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://software.intel.com",
+            "deposit_collection.name": "intel",
+            "auth_user.username": "intel",
+        },
+        {
+            "deposit.id": 268,
+            "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": None,
+            "deposit_request.date": datetime.datetime(
+                2019, 5, 14, 7, 49, 36, 477061, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://software.intel.com",
+            "deposit_collection.name": "intel",
+            "auth_user.username": "intel",
+        },
+        {
+            "deposit.id": 268,
+            "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": extrinsic_metadata,
+            "deposit_request.date": datetime.datetime(
+                2019, 5, 14, 7, 28, 33, 210100, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://software.intel.com",
+            "deposit_collection.name": "intel",
+            "auth_user.username": "intel",
+        },
+        {
+            "deposit.id": 268,
+            "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": None,
+            "deposit_request.date": datetime.datetime(
+                2019, 5, 14, 7, 28, 33, 41454, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://software.intel.com",
+            "deposit_collection.name": "intel",
+            "auth_user.username": "intel",
+        },
+    ]
+    # Expected origin: ends with deposit.external_id (not the metadata's "vtune-perf-tool") and equals the SWHID's origin qualifier.
+    origin_url = "https://software.intel.com/f80482de-90a8-4c32-bce4-6f6918d492ff"
+
+    storage = Mock()
+
+    def origin_get(urls):  # stub: the only lookup accepted is for origin_url
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = get_mock_deposit_cur(deposit_rows)  # helper defined outside this view; presumably a mock cursor yielding deposit_rows
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)  # a deep copy is passed, so the row fixture itself stays untouched
+    # The deposit cursor must have been executed and iterated exactly once.
+    deposit_cur.execute.assert_called_once()
+    deposit_cur.__iter__.assert_called_once()
+    # Expected writes: one object per deposit request carrying metadata (in deposit_rows order), then the original artifacts.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2019, 5, 14, 7, 49, 36, 775072, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=INTEL_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="sword-v2-atom-codemeta-v2-in-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2019, 5, 14, 7, 28, 33, 210100, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=INTEL_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="sword-v2-atom-codemeta-v2-in-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(  # original artifacts, attributed to SWH_AUTHORITY
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df"
+                    ),
+                    discovery_date=datetime.datetime(  # equals the most recent deposit_request.date in deposit_rows
+                        2019, 5, 14, 7, 49, 36, 775072, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(dest_original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
+
+
+def test_deposit_2_with_xmlns():  # deposit 687: metadata with "@xmlns"-style keys, nested under "extrinsic" in the revision
+    extrinsic_metadata = {
+        "title": "Je suis GPL",
+        "@xmlns": "http://www.w3.org/2005/Atom",
+        "client": "swh",
+        "codemeta:url": "https://forge.softwareheritage.org/source/jesuisgpl/",
+        "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+        "codemeta:author": {
+            "codemeta:name": "Stefano Zacchiroli",
+            "codemeta:jobTitle": "Maintainer",
+        },
+        "codemeta:license": {
+            "codemeta:url": "https://spdx.org/licenses/GPL-3.0-or-later.html",
+            "codemeta:name": "GNU General Public License v3.0 or later",
+        },
+        "codemeta:version": "0.1",
+        "external_identifier": "je-suis-gpl",
+        "codemeta:dateCreated": "2018-01-05",
+        "codemeta:description": "Je suis GPL is a modified version of GNU Hello whose\n    sole purpose is to showcase the usage of\n    Software Heritage for license compliance purposes.",
+        "codemeta:runtimePlatform": "GNU/Linux",
+        "codemeta:developmentStatus": "stable",
+        "codemeta:programmingLanguage": "C",
+    }
+    original_artifacts = [  # already in the "filename"/"checksums" layout; expected unchanged in the output
+        {
+            "length": 80880,
+            "filename": "archive.zip",
+            "checksums": {
+                "sha1": "bad32a47a359e0e16ebdca2ad2dc6a771dac8f71",
+                "sha256": "182b7ee3b7b5b550e83d3bcfed029bb2f625ee760ebfe9557d5fd072bd4e22e4",
+            },
+        }
+    ]
+    # Revision row: the deposit metadata is nested at metadata["extrinsic"]["raw"]["origin_metadata"]["metadata"].
+    row = {
+        "id": b'\x01"\x96nP\x93\x17\xae\xcejA\xd0\xf0\x88\xdas<\xc0\x9d\x0f',
+        "date": datetime.datetime(2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc),
+        "committer_date": datetime.datetime(
+            2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"swh: Deposit 687 in collection swh",
+        "metadata": {
+            "extrinsic": {
+                "raw": {
+                    "origin": {
+                        "url": "https://www.softwareheritage.org/check-deposit-2020-06-26T13:50:07.564420",
+                        "type": "deposit",
+                    },
+                    "origin_metadata": {
+                        "tool": {
+                            "name": "swh-deposit",
+                            "version": "0.0.1",
+                            "configuration": {"sword_version": 2},
+                        },
+                        "metadata": extrinsic_metadata,
+                        "provider": {
+                            "metadata": {},
+                            "provider_url": "https://www.softwareheritage.org",
+                            "provider_name": "swh",
+                            "provider_type": "deposit_client",
+                        },
+                    },
+                },
+                "when": "2020-06-26T13:50:22.640625+00:00",  # same instant as the expected discovery_date of the original artifacts below
+                "provider": "https://deposit.softwareheritage.org/1/private/687/meta/",
+            },
+            "original_artifact": original_artifacts,
+        },
+    }
+    # SWHID context recorded for the deposit; its anchor is the hex form of row["id"].
+    swhid = (
+        "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea"
+        ";origin=https://www.softwareheritage.org/check-deposit-2020-06-26T13:50:07.564420"
+        ";visit=swh:1:snp:8fd469e280fb0724175c64906627f619143d5bdb"
+        ";anchor=swh:1:rev:0122966e509317aece6a41d0f088da733cc09d0f"
+        ";path=/"
+    )
+    deposit_rows = [  # two requests for deposit 687, only the first carries metadata
+        {
+            "deposit.id": 687,
+            "deposit.external_id": "check-deposit-2020-06-26T13:50:07.564420",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": extrinsic_metadata,
+            "deposit_request.date": datetime.datetime(
+                2020, 6, 26, 13, 50, 8, 216113, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://www.softwareheritage.org",
+            "deposit_collection.name": "swh",
+            "auth_user.username": "swh",
+        },
+        {
+            "deposit.id": 687,
+            "deposit.external_id": "check-deposit-2020-06-26T13:50:07.564420",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": None,
+            "deposit_request.date": datetime.datetime(
+                2020, 6, 26, 13, 50, 8, 150498, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://www.softwareheritage.org",
+            "deposit_collection.name": "swh",
+            "auth_user.username": "swh",
+        },
+    ]
+    # Expected origin: the URL found both in the row's extrinsic "origin" entry and in the SWHID's origin qualifier.
+    origin_url = (
+        "https://www.softwareheritage.org/check-deposit-2020-06-26T13:50:07.564420"
+    )
+
+    storage = Mock()
+
+    def origin_get(urls):  # stub: the only lookup accepted is for origin_url
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = get_mock_deposit_cur(deposit_rows)  # helper defined outside this view; presumably a mock cursor yielding deposit_rows
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)  # a deep copy is passed, so the row fixture itself stays untouched
+    # The deposit cursor must have been executed and iterated exactly once.
+    deposit_cur.execute.assert_called_once()
+    deposit_cur.__iter__.assert_called_once()
+    # Expected writes: the deposit metadata (dated like the request that carried it), then the original artifacts.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0122966e509317aece6a41d0f088da733cc09d0f"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 6, 26, 13, 50, 8, 216113, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_FORGE_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="sword-v2-atom-codemeta-v2-in-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(  # original artifacts, attributed to SWH_AUTHORITY
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0122966e509317aece6a41d0f088da733cc09d0f"
+                    ),
+                    discovery_date=datetime.datetime(  # equals the "when" value stored in the row
+                        2020, 6, 26, 13, 50, 22, 640625, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
+
+
+def test_deposit_5_without_xmlns():  # deposit 79: metadata with expanded-namespace keys, stored directly as the revision's metadata
+    extrinsic_metadata = {
+        "{http://www.w3.org/2005/Atom}id": "hal-01243573",
+        "{http://www.w3.org/2005/Atom}author": {
+            "{http://www.w3.org/2005/Atom}name": "HAL",
+            "{http://www.w3.org/2005/Atom}email": "hal@ccsd.cnrs.fr",
+        },
+        "{http://www.w3.org/2005/Atom}client": "hal",
+        "{http://www.w3.org/2005/Atom}external_identifier": "hal-01243573",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url": "https://hal-test.archives-ouvertes.fr/hal-01243573",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "The assignment problem",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author": {
+            "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "Morane Gruenpeter"
+        },
+        # ...
+    }
+    # Revision row: "metadata" is the deposit metadata itself; there is no "original_artifact" key.
+    row = {
+        "id": b"\x03\x98\x7f\x05n\xafE\x96\xcd \xd7\xb2\xee\x01\xc9\xb8L\xed\xdf\xa8",
+        "date": datetime.datetime(
+            2018, 1, 17, 12, 49, 30, 902891, tzinfo=datetime.timezone.utc
+        ),
+        "committer_date": datetime.datetime(
+            2018, 1, 17, 12, 49, 30, 902891, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b": Deposit 79 in collection hal",
+        "metadata": extrinsic_metadata,
+    }
+    # SWHID context recorded for the deposit. NOTE(review): its anchor is not the hex form of row["id"]; the assertion below targets row["id"].
+    swhid = (
+        "swh:1:dir:e04b2a7b8a8838da0693e9fd992a10d6fd211b50"
+        ";origin=https://hal.archives-ouvertes.fr/hal-01243573"
+        ";visit=swh:1:snp:c31851534c86676a040fb10f438728c90f1c9d55"
+        ";anchor=swh:1:rev:43549ebbe70c9cdf0be1647e6319392eaa06f3a3"
+        ";path=/"
+    )
+    deposit_rows = [  # two requests for deposit 79, only the second carries metadata
+        {
+            "deposit.id": 79,
+            "deposit.external_id": "hal-01243573",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": None,
+            "deposit_request.date": datetime.datetime(
+                2018, 1, 17, 12, 49, 31, 208347, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+            "deposit_collection.name": "hal",
+            "auth_user.username": "hal",
+        },
+        {
+            "deposit.id": 79,
+            "deposit.external_id": "hal-01243573",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": extrinsic_metadata,
+            "deposit_request.date": datetime.datetime(
+                2018, 1, 17, 12, 49, 30, 645576, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+            "deposit_collection.name": "hal",
+            "auth_user.username": "hal",
+        },
+    ]
+    # Expected origin: equals the SWHID's origin qualifier, which is also provider_url + external_id (not the hal-test URL in the metadata).
+    origin_url = "https://hal.archives-ouvertes.fr/hal-01243573"
+
+    storage = Mock()
+
+    def origin_get(urls):  # stub: the only lookup accepted is for origin_url
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = get_mock_deposit_cur(deposit_rows)  # helper defined outside this view; presumably a mock cursor yielding deposit_rows
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)  # a deep copy is passed, so the row fixture itself stays untouched
+    # The deposit cursor must have been executed and iterated exactly once.
+    deposit_cur.execute.assert_called_once()
+    deposit_cur.__iter__.assert_called_once()
+    # Expected writes: only the deposit metadata, dated like the request that carried it.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:03987f056eaf4596cd20d7b2ee01c9b84ceddfa8"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2018, 1, 17, 12, 49, 30, 645576, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=HAL_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        # note: no original artifacts
+    ]
+
+
+def test_deposit_5_wrong_origin():  # deposit 75: the origin recorded in the SWHID is not provider_url + external_id
+    extrinsic_metadata = {
+        "{http://www.w3.org/2005/Atom}id": "hal-01588781",
+        "{http://www.w3.org/2005/Atom}author": {
+            "{http://www.w3.org/2005/Atom}name": "HAL",
+            "{http://www.w3.org/2005/Atom}email": "hal@ccsd.cnrs.fr",
+        },
+        "{http://www.w3.org/2005/Atom}client": "hal",
+        "{http://www.w3.org/2005/Atom}external_identifier": "hal-01588781",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url": "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "The assignment problem ",
+        "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author": {
+            "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "Morane Gruenpeter",
+            "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation": "Initiative pour la Recherche et l'Innovation sur le Logiciel Libre",
+        },
+        # ...
+    }
+    # Revision row: "metadata" is the deposit metadata itself; there is no "original_artifact" key.
+    row = {
+        "id": b"-{\xcec\x1f\xc7\x91\x08\x03\x11\xeb\x83\\GB\x8eXjn\xa4",
+        "date": datetime.datetime(
+            2018, 1, 10, 13, 14, 51, 77033, tzinfo=datetime.timezone.utc
+        ),
+        "committer_date": datetime.datetime(
+            2018, 1, 10, 13, 14, 51, 77033, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b": Deposit 75 in collection hal",
+        "metadata": extrinsic_metadata,
+    }
+    # SWHID context recorded for the deposit; its anchor is the hex form of row["id"], its origin is on inria.halpreprod.
+    swhid = (
+        "swh:1:dir:d8971c651fe256942aa4499a3ccdbaa305d3bade"
+        ";origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01588781"
+        ";visit=swh:1:snp:7c70cc8ea5b79e376605fd6e9b3b04d98861ffc0"
+        ";anchor=swh:1:rev:2d7bce631fc791080311eb835c47428e586a6ea4"
+        ";path=/"
+    )
+    deposit_rows = [  # two requests for deposit 75, only the second carries metadata
+        {
+            "deposit.id": 75,
+            "deposit.external_id": "hal-01588781",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": None,
+            "deposit_request.date": datetime.datetime(
+                2018, 1, 10, 13, 14, 51, 523963, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+            "deposit_collection.name": "hal",
+            "auth_user.username": "hal",
+        },
+        {
+            "deposit.id": 75,
+            "deposit.external_id": "hal-01588781",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": extrinsic_metadata,
+            "deposit_request.date": datetime.datetime(
+                2018, 1, 10, 13, 14, 50, 555143, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+            "deposit_collection.name": "hal",
+            "auth_user.username": "hal",
+        },
+    ]
+    # Expected origin: the SWHID's origin qualifier, whereas provider_url + external_id would be https://hal.archives-ouvertes.fr/hal-01588781.
+    origin_url = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781"
+
+    storage = Mock()
+
+    def origin_get(urls):  # stub: the only lookup accepted is for origin_url
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = get_mock_deposit_cur(deposit_rows)  # helper defined outside this view; presumably a mock cursor yielding deposit_rows
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)  # a deep copy is passed, so the row fixture itself stays untouched
+    # The deposit cursor must have been executed and iterated exactly once.
+    deposit_cur.execute.assert_called_once()
+    deposit_cur.__iter__.assert_called_once()
+    # Expected writes: only the deposit metadata, dated like the request that carried it.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:2d7bce631fc791080311eb835c47428e586a6ea4"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2018, 1, 10, 13, 14, 50, 555143, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=HAL_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        # note: no original artifacts
+    ]
+
+
+def test_deposit_missing_metadata_in_revision():  # deposit 229: the revision only stores "original_artifact"; the deposit metadata exists only in deposit_rows
+    extrinsic_metadata = {
+        "id": "hal-01243573",
+        "@xmlns": "http://www.w3.org/2005/Atom",
+        "author": {"name": "HAL", "email": "hal@ccsd.cnrs.fr"},
+        "client": "hal",
+        "committer": "Administrateur Du Ccsd",
+        "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243573",
+        "codemeta:name": "The assignment problem",
+        "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+        "codemeta:author": {"codemeta:name": "Morane Gruenpeter"},
+        "codemeta:license": {
+            "codemeta:name": "GNU General Public License v3.0 or later"
+        },
+        "codemeta:version": "1",
+        "codemeta:identifier": {"#text": "10.5281/zenodo.438684", "@name": "doi",},
+        "external_identifier": "hal-01243573",
+        "codemeta:dateCreated": "2017-11-16T14:54:23+01:00",
+        "codemeta:description": "Project in OR: The assignment problem A java implementation for the assignment problem first release",
+        "codemeta:codeRepository": "https://github.com/moranegg/AffectationRO",
+        "codemeta:operatingSystem": "debian",
+        "codemeta:runtimePlatform": "outil",
+        "codemeta:softwareVersion": "1.0.0",
+        "codemeta:developmentStatus": "etat",
+        "codemeta:applicationCategory": ["info", "info.info-ro"],
+        "codemeta:programmingLanguage": ["java", "AMPL"],
+    }
+    source_original_artifacts = [  # artifact as stored in the revision: "name" plus flat checksum keys
+        {
+            "name": "archive.zip",
+            "sha1": "e8e46324970cd5af7f98c5a86f33f47fa4a41b4a",
+            "length": 118650,
+            "sha256": "fec81b63d666c43524f966bbd3263da5bee55051d2b48c1659cca5f56fd953e5",
+            "sha1_git": "9da2bbd08bec590b36ede2ed43d74cd510b10a79",
+            "blake2s256": "5d0973ba3644cc2bcfdb41ff1891744337d6aa9547a7e59fe466f684b027f295",
+            "archive_type": "zip",
+        }
+    ]
+    dest_original_artifacts = [  # same artifact as expected in the output: "filename" plus nested "checksums"
+        {
+            "length": 118650,
+            "archive_type": "zip",
+            "filename": "archive.zip",
+            "checksums": {
+                "sha1": "e8e46324970cd5af7f98c5a86f33f47fa4a41b4a",
+                "sha256": "fec81b63d666c43524f966bbd3263da5bee55051d2b48c1659cca5f56fd953e5",
+                "sha1_git": "9da2bbd08bec590b36ede2ed43d74cd510b10a79",
+                "blake2s256": "5d0973ba3644cc2bcfdb41ff1891744337d6aa9547a7e59fe466f684b027f295",
+            },
+        }
+    ]
+    # Revision row: "metadata" holds nothing but "original_artifact".
+    row = {
+        "id": b"\x03@v\xf3\xf4\x1e\xe1 N\xb9\xf6@\x82\xcb\xe6\xe9P\xd7\xbb\x8a",
+        "date": datetime.datetime(
+            2019, 2, 25, 15, 49, 16, 594536, tzinfo=datetime.timezone.utc
+        ),
+        "committer_date": datetime.datetime(
+            2019, 2, 25, 15, 49, 16, 594536, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"hal: Deposit 229 in collection hal",
+        "metadata": {"original_artifact": source_original_artifacts},
+    }
+    # SWHID context recorded for the deposit; its anchor is the hex form of row["id"], its origin is the hal-test URL.
+    swhid = (
+        "swh:1:dir:3d65b6f065118cb856272829b459f0dfa55549aa"
+        ";origin=https://hal-test.archives-ouvertes.fr/hal-01243573"
+        ";visit=swh:1:snp:322c54ff4023d3216a994bc9ff9ee524ed80ee1f"
+        ";anchor=swh:1:rev:034076f3f41ee1204eb9f64082cbe6e950d7bb8a"
+        ";path=/"
+    )
+    deposit_rows = [  # two requests for deposit 229, only the second carries metadata
+        {
+            "deposit.id": 229,
+            "deposit.external_id": "hal-01243573",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": None,
+            "deposit_request.date": datetime.datetime(
+                2019, 2, 25, 15, 54, 30, 102072, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+            "deposit_collection.name": "hal",
+            "auth_user.username": "hal",
+        },
+        {
+            "deposit.id": 229,
+            "deposit.external_id": "hal-01243573",
+            "deposit.swh_id_context": swhid,
+            "deposit_request.metadata": extrinsic_metadata,
+            "deposit_request.date": datetime.datetime(
+                2019, 2, 25, 15, 49, 12, 302745, tzinfo=datetime.timezone.utc
+            ),
+            "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+            "deposit_collection.name": "hal",
+            "auth_user.username": "hal",
+        },
+    ]
+    # Expected origin: provider_url + external_id as found in deposit_rows.
+    origin_url = "https://hal.archives-ouvertes.fr/hal-01243573"
+    # /!\ not https://hal-test.archives-ouvertes.fr/hal-01243573
+    #     do not trust the metadata!
+
+    storage = Mock()
+
+    def origin_get(urls):  # stub: the only lookup accepted is for origin_url
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = get_mock_deposit_cur(deposit_rows)  # helper defined outside this view; presumably a mock cursor yielding deposit_rows
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)  # a deep copy is passed, so the row fixture itself stays untouched
+    # The deposit cursor must have been executed and iterated exactly once.
+    deposit_cur.execute.assert_called_once()
+    deposit_cur.__iter__.assert_called_once()
+    # Expected writes: the deposit metadata taken from deposit_rows, then the original artifacts in the "filename"/"checksums" layout.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:034076f3f41ee1204eb9f64082cbe6e950d7bb8a"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2019, 2, 25, 15, 49, 12, 302745, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=HAL_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="sword-v2-atom-codemeta-v2-in-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(  # original artifacts, attributed to SWH_AUTHORITY
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:034076f3f41ee1204eb9f64082cbe6e950d7bb8a"
+                    ),
+                    discovery_date=datetime.datetime(  # equals the most recent deposit_request.date in deposit_rows
+                        2019, 2, 25, 15, 54, 30, 102072, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(dest_original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py b/swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py
@@ -0,0 +1,108 @@
+# Copyright (C) 2020  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    Origin,
+    RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import handle_row, cran_package_from_url
+
+
+FETCHER = MetadataFetcher(  # fetcher expected on the metadata object asserted in test_gnu
+    name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+SWH_AUTHORITY = MetadataAuthority(  # authority expected on the "original-artifacts-json" object
+    type=MetadataAuthorityType.REGISTRY,
+    url="https://softwareheritage.org/",
+    metadata={},
+)
+
+
+def test_gnu():
+    """A GNU tarball revision yields a single "original-artifacts-json" object."""
+    original_artifacts = [
+        {
+            "length": 842501,
+            "filename": "gperf-3.0.1.tar.gz",
+            "checksums": {
+                "sha1": "c4453ee492032b369006ee464f4dd4e2c0c0e650",
+                "sha256": "5be283ef62e1bd26abdaaf88b416dbea4b14c360b09befcda2f055656dc43f87",
+                "sha1_git": "bf1d5bb57d571101dd7b6acab2b78ae11bb861de",
+                "blake2s256": "661f84afeb1e0b914defe2b249d424af1dfe380a96016b3282ae758c70e19a70",
+            },
+        }
+    ]
+
+    row = {
+        "id": b"\x00\x1cqE\x8e@[%\xba\xcc\xc8\x0b\x99\xf6cM\xff\x9d+\x18",
+        "date": datetime.datetime(2003, 6, 13, 0, 11, tzinfo=datetime.timezone.utc),
+        "committer_date": datetime.datetime(
+            2003, 6, 13, 0, 11, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"swh-loader-package: synthetic revision message",
+        "metadata": {
+            "extrinsic": {
+                "raw": {
+                    "url": "https://ftp.gnu.org/gnu/gperf/gperf-3.0.1.tar.gz",
+                    "time": "2003-06-13T00:11:00+00:00",
+                    "length": 842501,
+                    "version": "3.0.1",
+                    "filename": "gperf-3.0.1.tar.gz",
+                },
+                "when": "2019-11-27T11:17:38.318997+00:00",
+                "provider": "https://ftp.gnu.org/gnu/gperf/",
+            },
+            "intrinsic": {},
+            "original_artifact": original_artifacts,
+        },
+    }
+
+    origin_url = "https://ftp.gnu.org/gnu/gperf/"
+    storage = Mock()  # records calls; compared against the expected list below
+
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    # handle_row() received a deep copy, so `original_artifacts` is still the pristine fixture.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:001c71458e405b25baccc80b99f6634dff9d2b18"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2019, 11, 27, 11, 17, 38, 318997, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py b/swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py
@@ -0,0 +1,124 @@
+# Copyright (C) 2020  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    Origin,
+    RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import handle_row, cran_package_from_url
+
+
+FETCHER = MetadataFetcher(  # fetcher expected on both metadata objects asserted in test_nixguix
+    name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+SWH_AUTHORITY = MetadataAuthority(  # authority expected on the "original-artifacts-json" object
+    type=MetadataAuthorityType.REGISTRY,
+    url="https://softwareheritage.org/",
+    metadata={},
+)
+NIX_UNSTABLE_AUTHORITY = MetadataAuthority(  # authority expected on the "nixguix-sources-json" object
+    type=MetadataAuthorityType.FORGE,
+    url="https://nix-community.github.io/nixpkgs-swh/sources-unstable.json",
+    metadata={},
+)
+
+
+def test_nixguix():
+    """A nixguix revision yields a "nixguix-sources-json" and an artifacts object."""
+    extrinsic_metadata = {
+        "url": "https://files.pythonhosted.org/packages/source/a/alerta/alerta-7.4.5.tar.gz",
+        "integrity": "sha256-km8RAaG1ep+tYR8eHVr3UWk+/MNEqdsBr1Di/g02LYQ=",
+    }
+    original_artifacts = [
+        {
+            "length": 34903,
+            "filename": "alerta-7.4.5.tar.gz",
+            "checksums": {
+                "sha1": "66db4398b664de272fd5aa6610caa776b5e64651",
+                "sha256": "926f1101a1b57a9fad611f1e1d5af751693efcc344a9db01af50e2fe0d362d84",
+            },
+        }
+    ]
+
+    row = {
+        "id": b"\x00\x01\xbaM\xd0S\x94\x85\x02\x11\xd7\xb3\x85M\x99\x13\xd2:\xe3y",
+        "date": None,
+        "committer_date": None,
+        "type": "tar",
+        "message": b"",
+        "metadata": {
+            "extrinsic": {
+                "raw": extrinsic_metadata,
+                "when": "2020-06-03T11:25:05.259341+00:00",
+                "provider": "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json",
+            },
+            "original_artifact": original_artifacts,
+        },
+    }
+
+    origin_url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
+    storage = Mock()  # records calls; compared against the expected list below
+
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    # handle_row() received a deep copy, so the fixtures used below are still pristine.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0001ba4dd05394850211d7b3854d9913d23ae379"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 6, 3, 11, 25, 5, 259341, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=NIX_UNSTABLE_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="nixguix-sources-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0001ba4dd05394850211d7b3854d9913d23ae379"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 6, 3, 11, 25, 5, 259341, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_npm.py b/swh/storage/tests/migrate_extrinsic_metadata/test_npm.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_npm.py
@@ -0,0 +1,376 @@
+# Copyright (C) 2020  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    Origin,
+    RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import (
+    handle_row,
+    npm_package_from_source_url,
+)
+
+
+FETCHER = MetadataFetcher(  # fetcher expected on every metadata object asserted below
+    name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+NPM_AUTHORITY = MetadataAuthority(  # authority expected on "replicate-npm-package-json" objects
+    type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={},
+)
+SWH_AUTHORITY = MetadataAuthority(  # authority expected on "original-artifacts-json" objects
+    type=MetadataAuthorityType.REGISTRY,
+    url="https://softwareheritage.org/",
+    metadata={},
+)
+
+
+def test_npm_package_from_source_url():
+    """Checks package names (scoped, unscoped, percent-encoded) recovered from tarball URLs."""
+    package_urls = [
+        (
+            "@l3ilkojr/jdinsults",
+            "https://registry.npmjs.org/@l3ilkojr/jdinsults/-/jdinsults-3.0.0.tgz",
+        ),
+        ("simplemaps", "https://registry.npmjs.org/simplemaps/-/simplemaps-0.0.6.tgz"),
+        (
+            "@piximi/components",
+            "https://registry.npmjs.org/@piximi/components/-/components-0.1.11.tgz",
+        ),
+        (
+            "@chappa'ai/get-next-rc",
+            "https://registry.npmjs.org/@chappa%27ai/get-next-rc/-/get-next-rc-1.0.0.tgz",
+        ),
+    ]
+    for package_name, source_url in package_urls:
+        assert npm_package_from_source_url(source_url) == package_name
+
+
+def test_npm_1():
+    """Tests loading a revision generated by a new NPM loader that
+    has a provider."""
+
+    extrinsic_metadata = {
+        "_id": "@l3ilkojr/jdinsults@3.0.0",
+        "dist": {
+            "shasum": "b7f0d66090e0285f4e95d082d39bcb0c1b8f4ec8",
+            "tarball": "https://registry.npmjs.org/@l3ilkojr/jdinsults/-/jdinsults-3.0.0.tgz",
+            "fileCount": 4,
+            "integrity": "sha512-qpv8Zg51g0l51VjODEooMUGSGanGUuQpzX5msfR7ZzbgTsgPbpDNyTIsQ0wQzI9RzCCUjS84Ii2VhMISEQcEUA==",
+            "unpackedSize": 1583,
+            "npm-signature": "-----BEGIN PGP SIGNATURE-----\r\nVersion: OpenPGP.js v3.0.4\r\nComment: https://openpgpjs.org\r\n\r\nwsFcBAEBCAAQBQJeUMS5CRA9TVsSAnZWagAAXpgP/0YgNOWN0U/Fz2RGeQhR\nVIKPvfGqZ2UfFxxUXWIc4QHvwyLCNUedCctpVdqnqmGJ9m/hj3K2zbRPD7Tm\n3nPl0HfzE7v3T8TDZfGhzW3c9mWxig+syr+sjo0EKyAgZVJ0mxbjOl4KHt+U\nQEwl/4falBsyYtK/pkCXWmmuC606QmPn/c6ZRD1Fw4vJjT9i5qi1KaBkIf6M\nnFmpOFxTcwxGGltOk3s3TKDtr8CIeWmdm3VkgsP2ErkPKAOcu12AT4/5tkg0\nDU+m1XmJb67rskb4Ncjvic/VutnPkEfNrk1IRXrmjDZBQbHtCJ7hd5ETmb9S\nE5WmMV8cpaGiW7AZvGTmkn5WETwQQU7po914zYiMg9+ozdwc7yC8cpGj/UoF\niKxsc1uxdfwWk/p3dShegEYM7sveloIXYsPaxbd84WRIfnwkWFZV82op96E3\neX+FRkhMfsHlK8OjZsBPXkppaB48jnZdm3GOOzT9YgyphV33j3J9GnNcDMDe\nriyCLV1BNSKDHElCDrvl1cBGg+C5qn/cTYjQdfEPPY2Hl2MgW9s4UV2s+YSx\n0BBd2A3j80wncP+Y7HFeC4Pv0SM0Pdq6xJaf3ELhj6j0rVZeTW1O3E/PFLXK\nnn/DZcsFXgIzjY+eBIMQgAhqyeJve8LeQNnGt3iNW10E2nZMpfc+dn0ESiwV\n2Gw4\r\n=8uqZ\r\n-----END PGP SIGNATURE-----\r\n",
+        },
+        "name": "@l3ilkojr/jdinsults",
+        "version": "3.0.0",
+        "_npmUser": {"name": "l3ilkojr", "email": "l3ilkojr@example.com"},
+        "_npmVersion": "6.13.6",
+        "description": "Generates insults",
+        "directories": {},
+        "maintainers": [{"name": "l3ilkojr", "email": "l3ilkojr@example.com"}],
+        "_nodeVersion": "10.14.0",
+        "_hasShrinkwrap": False,
+        "_npmOperationalInternal": {
+            "tmp": "tmp/jdinsults_3.0.0_1582351545285_0.2614827716102821",
+            "host": "s3://npm-registry-packages",
+        },
+    }
+
+    original_artifacts = [
+        {
+            "length": 1033,
+            "filename": "jdinsults-3.0.0.tgz",
+            "checksums": {
+                "sha1": "b7f0d66090e0285f4e95d082d39bcb0c1b8f4ec8",
+                "sha256": "42f22795ac883b02fded0b2bf3d8a77f6507d40bc67f28eea6b1b73eb59c515f",
+            },
+        }
+    ]
+
+    row = {
+        "id": b"\x00\x00\x02\xa4\x9b\xba\x17\xca\x8c\xf3\x7f_=\x16\xaa\xac\xf9S`\xfc",
+        "date": datetime.datetime(2020, 2, 22, 6, 5, 45, tzinfo=datetime.timezone.utc),
+        "committer_date": datetime.datetime(
+            2020, 2, 22, 6, 5, 45, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"3.0.0",
+        "metadata": {
+            "extrinsic": {
+                "raw": extrinsic_metadata,
+                "when": "2020-02-27T01:35:47.965375+00:00",
+                "provider": "https://replicate.npmjs.com/%40l3ilkojr%2Fjdinsults/",
+            },
+            "intrinsic": {
+                "raw": {"name": "@l3ilkojr/jdinsults", "version": "3.0.0"},
+                "tool": "package.json",
+            },
+            "original_artifact": original_artifacts,
+        },
+    }
+
+    origin_url = "https://www.npmjs.com/package/@l3ilkojr/jdinsults"
+
+    storage = Mock()
+
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    # Both expected objects are dated with the "when" value of the extrinsic section.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:000002a49bba17ca8cf37f5f3d16aaacf95360fc"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 2, 27, 1, 35, 47, 965375, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=NPM_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="replicate-npm-package-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:000002a49bba17ca8cf37f5f3d16aaacf95360fc"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 2, 27, 1, 35, 47, 965375, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
+
+
+def test_npm_2_unscoped():
+    """Tests loading a revision generated by an old NPM loader that doesn't
+    have a provider; and the package name is unscoped (ie. doesn't contain a
+    slash)."""
+
+    extrinsic_metadata = {
+        "bugs": {"url": "https://github.com/niwasawa/simplemaps/issues"},
+        "name": "simplemaps",
+        "author": "Naoki Iwasawa",
+        "license": "MIT",
+        # ...
+    }
+
+    package_source = {
+        "url": "https://registry.npmjs.org/simplemaps/-/simplemaps-0.0.6.tgz",
+        "date": "2016-12-23T07:21:29.733Z",
+        "name": "simplemaps",
+        "sha1": "e2b8222930196def764527f5c61048c5b28fe3c4",
+        "sha256": "3ce94927bab5feafea5695d1fa4c2b8131413e53e249b32f9ac2ccff4d865a0b",
+        "version": "0.0.6",
+        "filename": "simplemaps-0.0.6.tgz",
+        "blake2s256": "6769b4009f8162be2e745604b153443d4907a85781d31a724217a3e2d42a7462",
+    }
+
+    original_artifacts = [
+        {
+            "filename": "simplemaps-0.0.6.tgz",
+            "checksums": {
+                "sha1": "e2b8222930196def764527f5c61048c5b28fe3c4",
+                "sha256": "3ce94927bab5feafea5695d1fa4c2b8131413e53e249b32f9ac2ccff4d865a0b",
+                "blake2s256": "6769b4009f8162be2e745604b153443d4907a85781d31a724217a3e2d42a7462",
+            },
+            "url": "https://registry.npmjs.org/simplemaps/-/simplemaps-0.0.6.tgz",
+        }
+    ]
+
+    row = {
+        "id": b"\x00\x00\x04\xae\xed\t\xee\x08\x9cx\x12d\xc0M%d\xfdX\xfe\xb5",
+        "date": datetime.datetime(
+            2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc
+        ),
+        "committer_date": datetime.datetime(
+            2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"0.0.6",
+        "metadata": {"package": extrinsic_metadata, "package_source": package_source,},
+    }
+
+    origin_url = "https://www.npmjs.com/package/simplemaps"
+
+    storage = Mock()
+
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    # Both expected objects are dated with the row's "date" value.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:000004aeed09ee089c781264c04d2564fd58feb5"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=NPM_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="replicate-npm-package-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:000004aeed09ee089c781264c04d2564fd58feb5"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
+
+
+def test_npm_2_scoped():
+    """Tests loading a revision generated by an old NPM loader that doesn't
+    have a provider; and the package name is scoped (ie. in the format
+    @org/name)."""
+
+    extrinsic_metadata = {
+        "bugs": {"url": "https://github.com/piximi/components/issues"},
+        "name": "@piximi/components",
+        # ...
+    }
+
+    package_source = {
+        "url": "https://registry.npmjs.org/@piximi/components/-/components-0.1.11.tgz",
+        "date": "2019-06-07T19:56:04.753Z",
+        "name": "@piximi/components",
+        "sha1": "4ab74e563cb61bb5b2022601a5133a2dd19d19ec",
+        "sha256": "69bb980bd6de3277b6bca86fd79c91f1c28db6910c8d03ecd05b32b78a35188f",
+        "version": "0.1.11",
+        "filename": "components-0.1.11.tgz",
+        "blake2s256": "ce33181d5eff25b70ffdd6f1a18acd472a1707ede23cd2adc6af272dfc40dbfd",
+    }
+
+    original_artifacts = [
+        {
+            "filename": "components-0.1.11.tgz",
+            "checksums": {
+                "sha1": "4ab74e563cb61bb5b2022601a5133a2dd19d19ec",
+                "sha256": "69bb980bd6de3277b6bca86fd79c91f1c28db6910c8d03ecd05b32b78a35188f",
+                "blake2s256": "ce33181d5eff25b70ffdd6f1a18acd472a1707ede23cd2adc6af272dfc40dbfd",
+            },
+            "url": "https://registry.npmjs.org/@piximi/components/-/components-0.1.11.tgz",
+        }
+    ]
+
+    row = {
+        "id": b"\x00\x00 \x19\xc5wXt\xbc\xed\x00zR\x9b\xd3\xb7\x8b\xf6\x04W",
+        "date": datetime.datetime(2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc),
+        "committer_date": datetime.datetime(
+            2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"0.1.11",
+        "metadata": {"package": extrinsic_metadata, "package_source": package_source,},
+    }
+
+    origin_url = "https://www.npmjs.com/package/@piximi/components"
+
+    storage = Mock()
+
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    # Both expected objects are dated with the row's "date" value.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:00002019c5775874bced007a529bd3b78bf60457"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=NPM_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="replicate-npm-package-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:00002019c5775874bced007a529bd3b78bf60457"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
@@ -0,0 +1,349 @@
+# Copyright (C) 2020  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    Origin,
+    RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import (
+    handle_row,
+    pypi_project_from_filename,
+)
+
+
+FETCHER = MetadataFetcher(  # fetcher expected on every metadata object asserted below
+    name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+PYPI_AUTHORITY = MetadataAuthority(  # authority expected on "pypi-project-json" objects
+    type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={},
+)
+SWH_AUTHORITY = MetadataAuthority(  # authority expected on "original-artifacts-json" objects
+    type=MetadataAuthorityType.REGISTRY,
+    url="https://softwareheritage.org/",
+    metadata={},
+)
+
+
+def test_pypi_project_from_filename():
+    """Checks the project name extracted from a few PyPI archive filenames."""
+    expected_projects = {
+        "django-agent-trust-0.1.8.tar.gz": "django-agent-trust",
+        "python_test-1.0.1.zip": "python_test",
+        "py-evm-0.2.0a9.tar.gz": "py-evm",
+        "collective.texttospeech-1.0rc1.tar.gz": "collective.texttospeech",
+        "flatland-fork-0.4.post1.dev40550160.zip": "flatland-fork",
+    }
+    for archive_name, project_name in expected_projects.items():
+        assert pypi_project_from_filename(archive_name) == project_name
+
+
+def test_pypi_1():
+    """Migrates a revision row whose metadata has an "extrinsic" section.
+
+    The section carries "provider" and "when" keys; both expected metadata
+    objects are dated with that "when" timestamp.
+    """
+    # "raw" extrinsic metadata as stored in the revision row.
+    pypi_json = {
+        "url": "https://files.pythonhosted.org/packages/70/89/a498245baf1bf3dde73d3da00b4b067a8aa7c7378ad83472078803ea3e43/m3-ui-2.2.73.tar.gz",
+        "size": 3933168,
+        "digests": {
+            "md5": "a374ac3f655e97df5db5335e2142d344",
+            "sha256": "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d",
+        },
+        "has_sig": False,
+        "filename": "m3-ui-2.2.73.tar.gz",
+        "downloads": -1,
+        "md5_digest": "a374ac3f655e97df5db5335e2142d344",
+        "packagetype": "sdist",
+        "upload_time": "2019-11-11T06:21:20",
+        "comment_text": "",
+        "python_version": "source",
+        "requires_python": None,
+        "upload_time_iso_8601": "2019-11-11T06:21:20.073082Z",
+    }
+
+    # Already in the "length"/"filename"/"checksums" layout expected on output.
+    artifacts = [
+        {
+            "length": 3933168,
+            "filename": "m3-ui-2.2.73.tar.gz",
+            "checksums": {
+                "sha1": "9f4ec7ce64b7fea4b122e85d47ea31146c367b03",
+                "sha256": "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d",
+            },
+        }
+    ]
+
+    # "date" and "committer_date" hold the same timestamp in this row.
+    utc = datetime.timezone.utc
+    upload_date = datetime.datetime(2019, 11, 11, 6, 21, 20, tzinfo=utc)
+
+    # Revision row handed to handle_row().
+    revision_row = {
+        "id": b"\x00\x00\x07a{S\xe7\xb1E\x8fi]\xd0}\xe4\xceU\xaf\x15\x17",
+        "date": upload_date,
+        "committer_date": upload_date,
+        "type": "tar",
+        "message": b"2.2.73",
+        "metadata": {
+            "extrinsic": {
+                "raw": pypi_json,
+                "when": "2020-01-23T18:43:09.109407+00:00",
+                "provider": "https://pypi.org/pypi/m3-ui/json",
+            },
+            "intrinsic": {
+                "raw": {
+                    "name": "m3-ui",
+                    "summary": "======",
+                    "version": "2.2.73",
+                    # ...
+                    "metadata_version": "1.1",
+                },
+                "tool": "PKG-INFO",
+            },
+            "original_artifact": artifacts,
+        },
+    }
+
+    origin_url = "https://pypi.org/project/m3-ui/"
+
+    # Stub answering the origin lookup; it also checks the requested URLs.
+    def fake_origin_get(requested_urls):
+        assert requested_urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage = Mock()
+    storage.origin_get.side_effect = fake_origin_get
+
+    # None stands in for the deposit cursor argument.
+    handle_row(copy.deepcopy(revision_row), storage, None, dry_run=False)
+
+    # Fields shared by both expected objects.
+    swhid = parse_swhid("swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517")
+    discovery_date = datetime.datetime(2020, 1, 23, 18, 43, 9, 109407, tzinfo=utc)
+    shared_fields = dict(
+        type=MetadataTargetType.REVISION,
+        id=swhid,
+        discovery_date=discovery_date,
+        fetcher=FETCHER,
+        origin=origin_url,
+    )
+
+    pypi_object = RawExtrinsicMetadata(
+        authority=PYPI_AUTHORITY,
+        format="pypi-project-json",
+        metadata=json.dumps(pypi_json).encode(),
+        **shared_fields,
+    )
+    artifacts_object = RawExtrinsicMetadata(
+        authority=SWH_AUTHORITY,
+        format="original-artifacts-json",
+        metadata=json.dumps(artifacts).encode(),
+        **shared_fields,
+    )
+
+    # One origin lookup, then each object is added through its own call.
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add([pypi_object]),
+        call.raw_extrinsic_metadata_add([artifacts_object]),
+    ]
+
+
+def test_pypi_2():
+    """Migrates a revision row whose metadata has no "extrinsic" section.
+
+    The metadata sits under the "project" key and the artifact entry is flat;
+    the expected objects carry no origin and are dated with the row's date.
+    """
+    # Stored under the row's "project" key; expected back as "pypi-project-json".
+    project_metadata = {
+        "name": "jupyterhub-simx",
+        "author": "Jupyter Development Team",
+        "license": "BSD",
+        "summary": "JupyterHub: A multi-user server for Jupyter notebooks",
+        "version": "1.0.5",
+        # ...
+    }
+
+    # Artifact entry as stored in the row: checksums and "size" are top-level keys.
+    stored_artifacts = [
+        {
+            "url": "https://files.pythonhosted.org/packages/72/28/a8098763d78e2c4607cb67602c0d726a97ac38d4c1f531aac28f49de2e1a/jupyterhub-simx-1.0.5.tar.gz",
+            "date": "2019-01-23T22:10:55",
+            "sha1": "ede3eadd5a06e70912e3ba7cfccef789c4ad3168",
+            "size": 2346538,
+            "sha256": "0399d7f5f0d90c525d369f0507ad0e8ef8729c1c7fa63aadfc46a27514d14a46",
+            "filename": "jupyterhub-simx-1.0.5.tar.gz",
+            "sha1_git": "734301124712182eb30fc90e97cc18cef5432f02",
+            "blake2s256": "bb4aa82ffb5891a05dcf6d4dce3ad56fd2c18e9abdba9d20972910649d869322",
+            "archive_type": "tar",
+        }
+    ]
+
+    # Same entry as expected on output: "size" became "length", hashes are grouped.
+    migrated_artifacts = [
+        {
+            "url": "https://files.pythonhosted.org/packages/72/28/a8098763d78e2c4607cb67602c0d726a97ac38d4c1f531aac28f49de2e1a/jupyterhub-simx-1.0.5.tar.gz",
+            "date": "2019-01-23T22:10:55",
+            "filename": "jupyterhub-simx-1.0.5.tar.gz",
+            "archive_type": "tar",
+            "length": 2346538,
+            "checksums": {
+                "sha1": "ede3eadd5a06e70912e3ba7cfccef789c4ad3168",
+                "sha256": "0399d7f5f0d90c525d369f0507ad0e8ef8729c1c7fa63aadfc46a27514d14a46",
+                "sha1_git": "734301124712182eb30fc90e97cc18cef5432f02",
+                "blake2s256": "bb4aa82ffb5891a05dcf6d4dce3ad56fd2c18e9abdba9d20972910649d869322",
+            },
+        }
+    ]
+
+    # "date" and "committer_date" are equal; the expected discovery date matches them.
+    utc = datetime.timezone.utc
+    release_date = datetime.datetime(2019, 1, 23, 22, 10, 55, tzinfo=utc)
+
+    revision_row = {
+        "id": b"\x00\x00\x04\xd68,J\xd4\xc0Q\x92fbl6U\x1f\x0eQ\xca",
+        "date": release_date,
+        "committer_date": release_date,
+        "type": "tar",
+        "message": b"1.0.5",
+        "metadata": {
+            "project": project_metadata,
+            "original_artifact": stored_artifacts,
+        },
+    }
+
+    origin_url = "https://pypi.org/project/jupyterhub-simx/"
+
+    # The stub is installed, but the expected calls below contain no origin
+    # lookup and both expected objects have origin=None.
+    def fake_origin_get(requested_urls):
+        assert requested_urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage = Mock()
+    storage.origin_get.side_effect = fake_origin_get
+
+    # None stands in for the deposit cursor argument.
+    handle_row(copy.deepcopy(revision_row), storage, None, dry_run=False)
+
+    # Fields shared by both expected objects.
+    swhid = parse_swhid("swh:1:rev:000004d6382c4ad4c0519266626c36551f0e51ca")
+    shared_fields = dict(
+        type=MetadataTargetType.REVISION,
+        id=swhid,
+        discovery_date=release_date,
+        fetcher=FETCHER,
+        origin=None,
+    )
+
+    project_object = RawExtrinsicMetadata(
+        authority=PYPI_AUTHORITY,
+        format="pypi-project-json",
+        metadata=json.dumps(project_metadata).encode(),
+        **shared_fields,
+    )
+    artifacts_object = RawExtrinsicMetadata(
+        authority=SWH_AUTHORITY,
+        format="original-artifacts-json",
+        metadata=json.dumps(migrated_artifacts).encode(),
+        **shared_fields,
+    )
+
+    # Each object is added through its own call.
+    assert storage.method_calls == [
+        call.raw_extrinsic_metadata_add([project_object]),
+        call.raw_extrinsic_metadata_add([artifacts_object]),
+    ]
+
+
+def test_pypi_3():  # row whose metadata holds only "original_artifact", as a single dict
+    source_original_artifact = {  # input shape: flat dict with "size" and top-level hashes
+        "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
+        "date": "2014-05-07T22:03:00",
+        "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
+        "size": 46644,
+        "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
+        "filename": "PyPDFLite-0.1.32.tar.gz",
+        "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
+        "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
+        "archive_type": "tar",
+    }
+
+    dest_original_artifacts = [  # expected shape: one-item list, "length" + nested "checksums"
+        {
+            "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
+            "date": "2014-05-07T22:03:00",
+            "filename": "PyPDFLite-0.1.32.tar.gz",
+            "archive_type": "tar",
+            "length": 46644,  # same value as the source dict's "size"
+            "checksums": {  # same four hash values as the source dict's top-level keys
+                "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
+                "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
+                "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
+                "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
+            },
+        }
+    ]
+
+    row = {  # input row handed to handle_row (a deep copy is passed, see below)
+        "id": b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2",  # hex: 4ea9917c...c834d2
+        "date": datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
+        "committer_date": datetime.datetime(
+            2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"0.1.32",
+        "metadata": {"original_artifact": source_original_artifact},  # the only metadata key
+    }
+
+    origin_url = "https://pypi.org/project/PyPDFLite/"
+
+    storage = Mock()  # stand-in storage; its recorded method calls are asserted below
+
+    def origin_get(urls):  # stub: accepts only [origin_url] and returns the matching Origin
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    # NOTE(review): the expected method_calls lists no origin_get call, so the stub looks unused here - confirm
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None  # no deposit cursor is supplied in this test
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)  # deep copy: `row` itself is not handed over
+
+    assert storage.method_calls == [  # exactly one recorded call is expected
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2"
+                    ),
+                    discovery_date=datetime.datetime(  # same value as row["date"] / row["committer_date"]
+                        2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(dest_original_artifacts).encode(),  # JSON text encoded to bytes
+                    origin=None,
+                ),
+            ]
+        ),
+    ]