diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
index e02467f..aa23794 100644
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -1,139 +1,123 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import iso8601
 import logging

 from os import path
 from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple

 from swh.loader.package.loader import PackageLoader
-from swh.loader.package.utils import release_name
+from swh.loader.package.utils import release_name, artifact_identity
 from swh.model.identifiers import normalize_timestamp

 logger = logging.getLogger(__name__)

 SWH_PERSON = {
     'name': b'Software Heritage',
     'fullname': b'Software Heritage',
     'email': b'robot@softwareheritage.org'
 }
 REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'


 class ArchiveLoader(PackageLoader):
     """Load archive origin's artifact files into swh archive

     """
     visit_type = 'tar'

     def __init__(self, url: str, artifacts: Sequence[Mapping[str, Any]],
                  identity_artifact_keys: Optional[Sequence[str]] = None):
         """Loader constructor.

         For now, this is the lister's task output.

         Args:
             url: Origin url
             artifacts: List of artifact information with keys:

                **time**: last modification time as either isoformat date
                string or timestamp

                **url**: the artifact url to retrieve

                **filename**: artifact's filename

                **version**: artifact's version

                **length**: artifact's length

             identity_artifact_keys: Optional List of keys forming the
                 "identity" of an artifact

         """
         super().__init__(url=url)
         self.artifacts = artifacts  # assume order is enforced in the lister
         if not identity_artifact_keys:
             # default keys for gnu
             identity_artifact_keys = ['time', 'url', 'length', 'version']
         self.identity_artifact_keys = identity_artifact_keys

     def get_versions(self) -> Sequence[str]:
         versions = []
         for archive in self.artifacts:
             v = archive.get('version')
             if v:
                 versions.append(v)
         return versions

     def get_default_version(self) -> str:
         # It's the most recent, so for this loader, it's the last one
         return self.artifacts[-1]['version']

     def get_package_info(self, version: str) -> Generator[
             Tuple[str, Mapping[str, Any]], None, None]:
         for a_metadata in self.artifacts:
             url = a_metadata['url']
             package_version = a_metadata['version']
             if version == package_version:
                 filename = a_metadata.get('filename')
                 p_info = {
                     'url': url,
                     'filename': filename if filename else path.split(url)[-1],
                     'raw': a_metadata,
                 }
                 # FIXME: this code assumes we have only 1 artifact per
                 # versioned package
                 yield release_name(version), p_info

     def resolve_revision_from(
             self, known_artifacts: Dict, artifact_metadata: Dict) \
             -> Optional[bytes]:
         identity = artifact_identity(
             artifact_metadata, id_keys=self.identity_artifact_keys)
         for rev_id, known_artifact in known_artifacts.items():
             logging.debug('known_artifact: %s', known_artifact)
             reference_artifact = known_artifact['extrinsic']['raw']
             known_identity = artifact_identity(
                 reference_artifact, id_keys=self.identity_artifact_keys)
             if identity == known_identity:
                 return rev_id
         return None

     def build_revision(self, a_metadata: Mapping[str, Any],
                        uncompressed_path: str) -> Dict:
         time = a_metadata['time']  # assume it's a timestamp
         if isinstance(time, str):  # otherwise, assume it's a parsable date
             time = iso8601.parse_date(time)
         normalized_time = normalize_timestamp(time)
         return {
             'type': 'tar',
             'message': REVISION_MESSAGE,
             'date': normalized_time,
             'author': SWH_PERSON,
             'committer': SWH_PERSON,
             'committer_date': normalized_time,
             'parents': [],
             'metadata': {
                 'intrinsic': {},
                 'extrinsic': {
                     'provider': self.url,
                     'when': self.visit_date.isoformat(),
                     'raw': a_metadata,
                 },
             },
         }
-
-
-def artifact_identity(d: Mapping[str, Any],
-                      id_keys: Sequence[str]) -> Sequence[Any]:
-    """Compute the primary key for a dict using the id_keys as primary key
-       composite.
-
-    Args:
-        d: A dict entry to compute the primary key on
-        id_keys: Sequence of keys to use as primary key
-
-    Returns:
-        The identity for that dict entry
-
-    """
-    return [d.get(k) for k in id_keys]
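For review context: with artifact_identity now shared from utils, resolve_revision_from above reduces to comparing composite keys. A minimal sketch of that behaviour (not part of this diff; the values mirror the 8sync fixture, and extra fields such as filename are ignored because they are not identity keys)::

    from swh.loader.package.utils import artifact_identity

    id_keys = ['time', 'url', 'length', 'version']  # gnu defaults

    # what a previous visit stored under metadata['extrinsic']['raw']
    known_raw = {
        'time': 944729610,
        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
        'length': 221837,
        'version': '0.1.0',
    }
    # the same artifact as re-listed, with one extra non-identity field
    incoming = dict(known_raw, filename='8sync-0.1.0.tar.gz')

    # identities match, so resolve_revision_from returns the known
    # revision id instead of rebuilding the revision
    assert artifact_identity(incoming, id_keys=id_keys) == \
        artifact_identity(known_raw, id_keys=id_keys)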
diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py
index 75b45aa..de3a4d0 100644
--- a/swh/loader/package/archive/tests/test_archive.py
+++ b/swh/loader/package/archive/tests/test_archive.py
@@ -1,366 +1,345 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from swh.model.hashutil import hash_to_bytes

-from swh.loader.package.archive.loader import ArchiveLoader, artifact_identity
+from swh.loader.package.archive.loader import ArchiveLoader
 from swh.loader.package.tests.common import (
     check_snapshot, check_metadata_paths, get_stats
 )

 URL = 'https://ftp.gnu.org/gnu/8sync/'

 GNU_ARTIFACTS = [
     {
         'time': 944729610,
         'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
         'length': 221837,
         'filename': '8sync-0.1.0.tar.gz',
         'version': '0.1.0',
     }
 ]

 _expected_new_contents_first_visit = [
     'e9258d81faf5881a2f96a77ba609396f82cb97ad',
     '1170cf105b04b7e2822a0e09d2acf71da7b9a130',
     'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac',
     '0057bec9b5422aff9256af240b177ac0e3ac2608',
     '2b8d0d0b43a1078fc708930c8ddc2956a86c566e',
     '27de3b3bc6545d2a797aeeb4657c0e215a0c2e55',
     '2e6db43f5cd764e677f416ff0d0c78c7a82ef19b',
     'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62',
     'edeb33282b2bffa0e608e9d2fd960fd08093c0ea',
     'd64e64d4c73679323f8d4cde2643331ba6c20af9',
     '7a756602914be889c0a2d3952c710144b3e64cb0',
     '84fb589b554fcb7f32b806951dcf19518d67b08f',
     '8624bcdae55baeef00cd11d5dfcfa60f68710a02',
     'e08441aeab02704cfbd435d6445f7c072f8f524e',
     'f67935bc3a83a67259cda4b2d43373bd56703844',
     '809788434b433eb2e3cfabd5d591c9a659d5e3d8',
     '7d7c6c8c5ebaeff879f61f37083a3854184f6c41',
     'b99fec102eb24bffd53ab61fc30d59e810f116a2',
     '7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68',
     'f0c97052e567948adf03e641301e9983c478ccff',
     '7fb724242e2b62b85ca64190c31dcae5303e19b3',
     '4f9709e64a9134fe8aefb36fd827b84d8b617ab5',
     '7350628ccf194c2c3afba4ac588c33e3f3ac778d',
     '0bb892d9391aa706dc2c3b1906567df43cbe06a2',
     '49d4c0ce1a16601f1e265d446b6c5ea6b512f27c',
     '6b5cc594ac466351450f7f64a0b79fdaf4435ad3',
     '3046e5d1f70297e2a507b98224b6222c9688d610',
     '1572607d456d7f633bc6065a2b3048496d679a31',
 ]

 _expected_new_directories_first_visit = [
     'daabc65ec75d487b1335ffc101c0ac11c803f8fc',
     '263be23b4a8101d3ad0d9831319a3e0f2b065f36',
     '7f6e63ba6eb3e2236f65892cd822041f1a01dd5c',
     '4db0a3ecbc976083e2dac01a62f93729698429a3',
     'dfef1c80e1098dd5deda664bb44a9ab1f738af13',
     'eca971d346ea54d95a6e19d5051f900237fafdaa',
     '3aebc29ed1fccc4a6f2f2010fb8e57882406b528',
 ]

 _expected_new_revisions_first_visit = {
     '44183488c0774ce3c957fa19ba695cf18a4a42b3':
         '3aebc29ed1fccc4a6f2f2010fb8e57882406b528'
 }

 _expected_branches_first_visit = {
     'HEAD': {
         'target_type': 'alias',
         'target': 'releases/0.1.0',
     },
     'releases/0.1.0': {
         'target_type': 'revision',
         'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3',
     },
 }

 # hash is different than before as we changed the snapshot
 # gnu used to use `release/` (singular) instead of plural
 _expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5'  # noqa


 def visit_with_no_artifact_found(swh_config, requests_mock_datadir):
     url = URL
     unknown_artifact_url = 'https://ftp.g.o/unknown/8sync-0.1.0.tar.gz'
     loader = ArchiveLoader(url, artifacts=[
         {
             'time': 944729610,
             'url': unknown_artifact_url,  # unknown artifact
             'length': 221837,
             'filename': '8sync-0.1.0.tar.gz',
             'version': '0.1.0',
         }
     ])

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'uneventful'
     assert actual_load_status['snapshot_id'] is not None
     stats = get_stats(loader.storage)

     assert {
         'content': 0,
         'directory': 0,
         'origin': 1,
         'origin_visit': 1,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats

     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'partial'
     assert origin_visit['type'] == 'tar'


 def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
     loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     expected_revision_id = hash_to_bytes(
         '44183488c0774ce3c957fa19ba695cf18a4a42b3')
     revision = list(loader.storage.revision_get([expected_revision_id]))[0]
     assert revision is not None

     check_metadata_paths(revision['metadata'], paths=[
         ('intrinsic', dict),
         ('extrinsic.provider', str),
         ('extrinsic.when', str),
         ('extrinsic.raw', dict),
         ('original_artifact', list),
     ])

     for original_artifact in revision['metadata']['original_artifact']:
         check_metadata_paths(original_artifact, paths=[
             ('filename', str),
             ('length', int),
             ('checksums', dict),
         ])


 def test_visit_with_release_artifact_no_prior_visit(
         swh_config, requests_mock_datadir):
     """With no prior visit, loading a gnu project ends up with 1 snapshot

     """
     loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] == _expected_new_snapshot_first_visit_id  # noqa

     stats = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
     assert list(loader.storage.content_missing_per_sha1(expected_contents)) \
         == []

     expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
     assert list(loader.storage.directory_missing(expected_dirs)) == []

     expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
     assert list(loader.storage.revision_missing(expected_revs)) == []

     expected_snapshot = {
         'id': _expected_new_snapshot_first_visit_id,
         'branches': _expected_branches_first_visit,
     }

     check_snapshot(expected_snapshot, loader.storage)


 def test_2_visits_without_change(swh_config, requests_mock_datadir):
     """Visiting a gnu project twice with no change ends up with 1 snapshot

     """
     url = URL
     loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'tar'

     actual_load_status2 = loader.load()
     assert actual_load_status2['status'] == 'uneventful'
     assert actual_load_status2['snapshot_id'] is not None
     assert actual_load_status['snapshot_id'] == actual_load_status2[
         'snapshot_id']

     origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'tar'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1


 def test_2_visits_with_new_artifact(swh_config, requests_mock_datadir):
     """A new artifact on the second visit ends up in a new snapshot

     """
     url = URL
     artifact1 = GNU_ARTIFACTS[0]
     loader = ArchiveLoader(url, [artifact1])

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'tar'

     stats = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1

     artifact2 = {
         'time': 1480991830,
         'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
         'length': 238466,
         'filename': '8sync-0.2.0.tar.gz',
         'version': '0.2.0',
     }

     loader2 = ArchiveLoader(url, [artifact1, artifact2])
     # implementation detail: share the storage in between visits
     loader2.storage = loader.storage
     stats2 = get_stats(loader2.storage)
     assert stats == stats2  # ensure we share the storage

     actual_load_status2 = loader2.load()
     assert actual_load_status2['status'] == 'eventful'
     assert actual_load_status2['snapshot_id'] is not None

     stats2 = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit) + 14,
         'directory': len(_expected_new_directories_first_visit) + 8,
         'origin': 1,
         'origin_visit': 1 + 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit) + 1,
         'skipped_content': 0,
         'snapshot': 1 + 1,
     } == stats2

     origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'tar'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     # 1 artifact (2nd time no modification) + 1 new artifact
     assert len(urls) == 2


-def test_artifact_identity():
-    """Compute primary key should return the right identity
-
-    """
-    data = {
-        'a': 1,
-        'b': 2,
-        'length': 221837,
-        'filename': '8sync-0.1.0.tar.gz',
-        'version': '0.1.0',
-    }
-
-    for id_keys, expected_id in [
-            (['a', 'b'], [1, 2]),
-            ([], []),
-            (['a', 'key-that-does-not-exist'], [1, None])
-    ]:
-        actual_id = artifact_identity(data, id_keys=id_keys)
-        assert actual_id == expected_id
-
-
 def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir):
     """Loading a project archive (not gnu) twice ends up with 1 snapshot

     """
     url = 'https://something.else.org/8sync/'
     artifacts = [  # this is not a gnu artifact
         {
             'time': '1999-12-09T09:53:30+00:00',  # it's also not a timestamp
             'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4',  # noqa
             # keep a gnu artifact reference to avoid adding other test files
             'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
             'length': 238466,
             'filename': '8sync-0.2.0.tar.gz',
             'version': '0.2.0',
         }
     ]

     # Here the loader defines the id_keys to use for existence in the
     # snapshot; it's not the default identity of the archive loader
     # (time, url, length, version) but a checksum-based one
     loader = ArchiveLoader(
         url, artifacts=artifacts, identity_artifact_keys=[
             'sha256', 'length', 'url'])

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'tar'

     actual_load_status2 = loader.load()
     assert actual_load_status2['status'] == 'uneventful'
     assert actual_load_status2['snapshot_id'] == actual_load_status[
         'snapshot_id']

     origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'tar'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1
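One point the non-gnu test above makes indirectly: with the default gnu keys, a re-listed artifact whose modification time changed gets a new identity and is loaded again, whereas checksum-based keys keep the identity stable. A quick sketch (the shifted time value is hypothetical; the other values come from the test fixture)::

    from swh.loader.package.utils import artifact_identity

    before = {
        'time': '1999-12-09T09:53:30+00:00',
        'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4',  # noqa
        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
        'length': 238466,
        'version': '0.2.0',
    }
    # same bytes, hypothetical new mtime after a re-listing
    after = dict(before, time='2000-01-01T00:00:00+00:00')

    gnu_keys = ['time', 'url', 'length', 'version']
    assert artifact_identity(before, id_keys=gnu_keys) \
        != artifact_identity(after, id_keys=gnu_keys)

    checksum_keys = ['sha256', 'length', 'url']
    assert artifact_identity(before, id_keys=checksum_keys) \
        == artifact_identity(after, id_keys=checksum_keys)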
diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
index 63024b8..9f40bcb 100644
--- a/swh/loader/package/tests/test_utils.py
+++ b/swh/loader/package/tests/test_utils.py
@@ -1,307 +1,322 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import pytest

 import swh.loader.package

 from swh.loader.package.utils import (
-    download, api_info, release_name, parse_author
+    download, api_info, release_name, parse_author, artifact_identity
 )


 def test_version_generation():
     assert swh.loader.package.__version__ != 'devel', \
         "Make sure swh.loader.core is installed (e.g. pip install -e .)"


 @pytest.mark.fs
 def test_download_fail_to_download(tmp_path, requests_mock):
     url = 'https://pypi.org/pypi/arrow/json'
     status_code = 404
     requests_mock.get(url, status_code=status_code)

     with pytest.raises(ValueError) as e:
         download(url, tmp_path)

     assert e.value.args[0] == "Fail to query '%s'. Reason: %s" % (
         url, status_code)


 @pytest.mark.fs
 def test_download_ok(tmp_path, requests_mock):
     """Download without issue should provide filename and hashes"""
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     requests_mock.get(url, text=data, headers={
         'content-length': str(len(data))
     })

     actual_filepath, actual_hashes = download(url, dest=str(tmp_path))

     actual_filename = os.path.basename(actual_filepath)
     assert actual_filename == filename
     assert actual_hashes['length'] == len(data)
     assert actual_hashes['checksums']['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2'  # noqa
     assert (actual_hashes['checksums']['sha256'] ==
             '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5')


 @pytest.mark.fs
 def test_download_ok_no_header(tmp_path, requests_mock):
     """Download without issue should provide filename and hashes"""
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     requests_mock.get(url, text=data)  # no header information

     actual_filepath, actual_hashes = download(url, dest=str(tmp_path))

     actual_filename = os.path.basename(actual_filepath)
     assert actual_filename == filename
     assert actual_hashes['length'] == len(data)
     assert actual_hashes['checksums']['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2'  # noqa
     assert (actual_hashes['checksums']['sha256'] ==
             '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5')


 @pytest.mark.fs
 def test_download_ok_with_hashes(tmp_path, requests_mock):
     """Download without issue should provide filename and hashes"""
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     requests_mock.get(url, text=data, headers={
         'content-length': str(len(data))
     })

     # good hashes for such file
     good = {
         'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2',
         'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5',  # noqa
     }

     actual_filepath, actual_hashes = download(url, dest=str(tmp_path),
                                               hashes=good)

     actual_filename = os.path.basename(actual_filepath)
     assert actual_filename == filename
     assert actual_hashes['length'] == len(data)
     assert actual_hashes['checksums']['sha1'] == good['sha1']
     assert actual_hashes['checksums']['sha256'] == good['sha256']


 @pytest.mark.fs
 def test_download_fail_hashes_mismatch(tmp_path, requests_mock):
     """Mismatch hash after download should raise

     """
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     requests_mock.get(url, text=data, headers={
         'content-length': str(len(data))
     })

     # good hashes for such file
     good = {
         'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2',
         'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5',  # noqa
     }

     for hash_algo in good.keys():
         wrong_hash = good[hash_algo].replace('1', '0')
         expected_hashes = good.copy()
         expected_hashes[hash_algo] = wrong_hash  # set the wrong hash

         expected_msg = ("Failure when fetching %s. "
                         "Checksum mismatched: %s != %s" % (
                             url, wrong_hash, good[hash_algo]
                         ))

         with pytest.raises(ValueError, match=expected_msg):
             download(url, dest=str(tmp_path), hashes=expected_hashes)


 def test_api_info_failure(requests_mock):
     """Failure to fetch info/release information should raise"""
     url = 'https://pypi.org/pypi/requests/json'
     status_code = 400
     requests_mock.get(url, status_code=status_code)

     with pytest.raises(ValueError) as e0:
         api_info(url)
     assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (
         url, status_code
     )


 def test_api_info(requests_mock):
     """Fetching json info from pypi project should be ok"""
     url = 'https://pypi.org/pypi/requests/json'
     requests_mock.get(url, text='{"version": "0.0.1"}')

     actual_info = api_info(url)
     assert actual_info == {
         'version': '0.0.1',
     }


 def test_release_name():
     for version, filename, expected_release in [
             ('0.0.1', None, 'releases/0.0.1'),
             ('0.0.2', 'something', 'releases/0.0.2/something')]:
         assert release_name(version, filename) == expected_release


 def _parse_author_string_test(author_str, expected_result):
     assert parse_author(author_str) == expected_result
     assert parse_author(' %s' % author_str) == expected_result
     assert parse_author('%s ' % author_str) == expected_result


 def test_parse_author():
     _parse_author_string_test(
         'John Doe',
         {
             'name': 'John Doe'
         }
     )
     _parse_author_string_test(
         '<john.doe@foo.bar>',
         {
             'email': 'john.doe@foo.bar'
         }
     )
     _parse_author_string_test(
         '(https://john.doe)',
         {
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe <john.doe@foo.bar>',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar'
         }
     )
     _parse_author_string_test(
         'John Doe<john.doe@foo.bar>',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar'
         }
     )
     _parse_author_string_test(
         'John Doe (https://john.doe)',
         {
             'name': 'John Doe',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe(https://john.doe)',
         {
             'name': 'John Doe',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         '<john.doe@foo.bar> (https://john.doe)',
         {
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         '(https://john.doe) <john.doe@foo.bar>',
         {
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe <john.doe@foo.bar> (https://john.doe)',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe (https://john.doe) <john.doe@foo.bar>',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe<john.doe@foo.bar> (https://john.doe)',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe(https://john.doe)<john.doe@foo.bar>',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test('', {})
     _parse_author_string_test('<>', {})
     _parse_author_string_test(' <>', {})
     _parse_author_string_test('<>()', {})
     _parse_author_string_test('<> ()', {})
     _parse_author_string_test('()', {})
     _parse_author_string_test(' ()', {})

     _parse_author_string_test(
         'John Doe <> ()',
         {
             'name': 'John Doe'
         }
     )
     _parse_author_string_test(
         'John Doe <>',
         {
             'name': 'John Doe'
         }
     )
     _parse_author_string_test(
         'John Doe ()',
         {
             'name': 'John Doe'
         }
     )


-# def test_swh_author():
-#     for author, expected_author in [
-#         ({}, )
-#     ]:
+def test_artifact_identity():
+    """Compute primary key should return the right identity
+
+    """
+    data = {
+        'a': 1,
+        'b': 2,
+        'length': 221837,
+        'filename': '8sync-0.1.0.tar.gz',
+        'version': '0.1.0',
+    }
+
+    for id_keys, expected_id in [
+            (['a', 'b'], [1, 2]),
+            ([], []),
+            (['a', 'key-that-does-not-exist'], [1, None])
+    ]:
+        actual_id = artifact_identity(data, id_keys=id_keys)
+        assert actual_id == expected_id
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
index 9f78a35..eff017f 100644
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -1,197 +1,213 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 import copy
 import logging
 import os
 import requests
 import re

-from typing import Dict, Optional, Tuple
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple

 from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
 from swh.loader.package import DEFAULT_PARAMS

 logger = logging.getLogger(__name__)


 DOWNLOAD_HASHES = set(['sha1', 'sha256', 'length'])

 # https://github.com/jonschlinkert/author-regex
 _author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'

 _EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}


 def api_info(url: str) -> Dict:
     """Basic api client to retrieve information on project. This deals with
        fetching json metadata about pypi projects.

     Args:
         url (str): The api url (e.g PyPI, npm, etc...)

     Raises:
         ValueError in case of query failures (for some reasons: 404, ...)

     Returns:
         The associated response's information dict

     """
     response = requests.get(url, **DEFAULT_PARAMS)
     if response.status_code != 200:
         raise ValueError("Fail to query '%s'. Reason: %s" % (
             url, response.status_code))
     return response.json()


 def download(url: str, dest: str, hashes: Dict = {},
              filename: Optional[str] = None,
              auth: Optional[Tuple[str, str]] = None) -> Tuple[str, Dict]:
     """Download a remote tarball from url, uncompresses and computes swh hashes
        on it.

     Args:
         url: Artifact uri to fetch, uncompress and hash
         dest: Directory to write the archive to
         hashes: Dict of expected hashes (key is the hash algo) for the
             artifact to download (those hashes are expected to be hex string)
         auth: Optional tuple of login/password (for http authentication
             service, e.g. deposit)

     Raises:
         ValueError in case of any error when fetching/computing (length,
         checksums mismatched...)

     Returns:
         Tuple of local (filepath, hashes of filepath)

     """
     params = copy.deepcopy(DEFAULT_PARAMS)
     if auth is not None:
         params['auth'] = auth
     response = requests.get(url, **params, stream=True)
     if response.status_code != 200:
         raise ValueError("Fail to query '%s'. Reason: %s" % (
             url, response.status_code))

     filename = filename if filename else os.path.basename(url)
     logger.debug('filename: %s', filename)
     filepath = os.path.join(dest, filename)
     logger.debug('filepath: %s', filepath)

     h = MultiHash(hash_names=DOWNLOAD_HASHES)
     with open(filepath, 'wb') as f:
         for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
             h.update(chunk)
             f.write(chunk)

     # Also check the expected hashes if provided
     if hashes:
         actual_hashes = h.hexdigest()
         for algo_hash in hashes.keys():
             actual_digest = actual_hashes[algo_hash]
             expected_digest = hashes[algo_hash]
             if actual_digest != expected_digest:
                 raise ValueError(
                     'Failure when fetching %s. '
                     'Checksum mismatched: %s != %s' % (
                         url, expected_digest, actual_digest))

     computed_hashes = h.hexdigest()
     length = computed_hashes.pop('length')
     extrinsic_metadata = {
         'length': length,
         'filename': filename,
         'checksums': computed_hashes,
     }

     logger.debug('extrinsic_metadata: %s', extrinsic_metadata)

     return filepath, extrinsic_metadata


 def release_name(version: str, filename: Optional[str] = None) -> str:
     if filename:
         return 'releases/%s/%s' % (version, filename)
     return 'releases/%s' % version


 def parse_author(author_str: str) -> Dict[str, str]:
     """
     Parse npm package author string.
     It works with a flexible range of formats, as detailed below::

         name
         name <email>
         name<email>
         name (url)
         name(url)
         name <email> (url)
         name (url) <email>
         name<email> (url)
         name(url)<email>
         <email>
         <email> (url)
         (url) <email>
         (url)

     Args:
         author_str (str): input author string

     Returns:
         dict: A dict that may contain the following keys:
             * name
             * email
             * url

     """
     author = {}
     matches = re.findall(_author_regexp,
                          author_str.replace('<>', '').replace('()', ''),
                          re.M)
     for match in matches:
         if match[0].strip():
             author['name'] = match[0].strip()
         if match[1].strip():
             author['email'] = match[1].strip()
         if match[2].strip():
             author['url'] = match[2].strip()
     return author


 def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]:
     """Transform an author like dict to an expected swh like dict
        (values are bytes)

     """
     name = author.get('name')
     email = author.get('email')

     fullname = None

     if name and email:
         fullname = '%s <%s>' % (name, email)
     elif name:
         fullname = name

     if not fullname:
         r = _EMPTY_AUTHOR
     else:
         r = {
             'fullname': fullname.encode('utf-8') if fullname else None,
             'name': name.encode('utf-8') if name else None,
             'email': email.encode('utf-8') if email else None
         }
     return r
+
+
+def artifact_identity(d: Mapping[str, Any],
+                      id_keys: Sequence[str]) -> List[Any]:
+    """Compute the primary key for a dict using the id_keys as primary key
+       composite.
+
+    Args:
+        d: A dict entry to compute the primary key on
+        id_keys: Sequence of keys to use as primary key
+
+    Returns:
+        The identity for that dict entry
+
+    """
+    return [d.get(k) for k in id_keys]
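Worth noting about the moved helper: keys missing from the dict contribute None rather than raising, so an empty key list yields an empty (always-equal) identity, and a key absent from both artifacts still compares equal. A short sketch mirroring the relocated test::

    from swh.loader.package.utils import artifact_identity

    artifact = {'time': 944729610, 'length': 221837, 'version': '0.1.0'}

    assert artifact_identity(artifact, id_keys=['time', 'version']) \
        == [944729610, '0.1.0']
    # absent keys degrade to None instead of raising KeyError
    assert artifact_identity(artifact, id_keys=['sha256']) == [None]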