Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9312249
from_disk.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
13 KB
Subscribers
None
from_disk.py
View Options
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from
collections
import
defaultdict
import
datetime
import
os
import
shutil
from
typing
import
Dict
,
Optional
from
dulwich.errors
import
ObjectFormatException
try
:
from
dulwich.errors
import
EmptyFileException
except
ImportError
:
# dulwich >= 0.20
from
dulwich.objects
import
EmptyFileException
import
dulwich.repo
from
swh.model
import
hashutil
from
swh.model.model
import
Origin
,
Snapshot
,
SnapshotBranch
,
TargetType
from
swh.loader.core.loader
import
DVCSLoader
from
swh.storage.algos.origin
import
origin_get_latest_visit_status
from
.
import
converters
,
utils
class GitLoaderFromDisk(DVCSLoader):
    """Load a git repository from a directory.

    Objects (blobs, trees, commits, tags) are iterated from the dulwich
    object store, checked for well-formedness, converted to Software
    Heritage model objects, and handed to storage by the DVCSLoader
    machinery.
    """

    CONFIG_BASE_FILENAME = "loader/git-disk"

    visit_type = "git"

    def __init__(self, url, visit_date=None, directory=None, config=None):
        """
        Args:
            url: origin URL of the repository
            visit_date: optional date of the visit
            directory: path to the local git repository to load
            config: optional loader configuration dict
        """
        super().__init__(logging_class="swh.loader.git.Loader", config=config)
        self.origin_url = url
        self.visit_date = visit_date
        self.directory = directory

    def prepare_origin_visit(self, *args, **kwargs):
        """Build the Origin object for this visit."""
        self.origin = Origin(url=self.origin_url)

    def prepare(self, *args, **kwargs):
        """Open the on-disk repository with dulwich."""
        self.repo = dulwich.repo.Repo(self.directory)

    def iter_objects(self):
        """Iterate over all object ids (as hex bytes) in the repository.

        Packed objects come first, sorted by pack offset so each pack
        file is read mostly sequentially; then loose objects, then
        objects reachable through alternates.
        """
        object_store = self.repo.object_store

        for pack in object_store.packs:
            objs = list(pack.index.iterentries())
            # entry layout is (sha, offset, crc32); sort on offset for
            # sequential pack reads
            objs.sort(key=lambda entry: entry[1])
            for sha, offset, crc32 in objs:
                yield hashutil.hash_to_bytehex(sha)

        yield from object_store._iter_loose_objects()
        yield from object_store._iter_alternate_objects()

    def _check(self, obj):
        """Check the object's repository representation.

        If any errors in check exists, an ObjectFormatException is
        raised.

        Args:
            obj (object): Dulwich object read from the repository.
        """
        obj.check()
        from dulwich.objects import Commit, Tag

        try:
            # For additional checks on dulwich objects with date
            # for now, only checks on *time
            if isinstance(obj, Commit):
                commit_time = obj._commit_time
                utils.check_date_time(commit_time)
                author_time = obj._author_time
                utils.check_date_time(author_time)
            elif isinstance(obj, Tag):
                tag_time = obj._tag_time
                utils.check_date_time(tag_time)
        except Exception as e:
            # chain the original exception so the root cause stays visible
            raise ObjectFormatException(e) from e

    def _warn_skipped_object(self, oid, problem):
        """Log a warning that object *oid* is skipped because of *problem*.

        Args:
            oid (bytes): hex identifier of the problematic object
            problem (str): short description, e.g. "not found"
        """
        _id = oid.decode("utf-8")
        # log.warning, not the deprecated log.warn alias
        self.log.warning(
            "object %s %s, skipping" % (_id, problem),
            extra={
                "swh_type": "swh_loader_git_missing_object",
                "swh_object_id": _id,
                "origin_url": self.origin.url,
            },
        )

    def get_object(self, oid):
        """Given an object id, return the object if it is found and not
        malformed in some way.

        Args:
            oid (bytes): the object's identifier

        Returns:
            The object if found without malformation, None otherwise.
        """
        try:
            # some errors are raised when reading the object
            obj = self.repo[oid]
            # some we need to check ourselves
            self._check(obj)
        except KeyError:
            self._warn_skipped_object(oid, "not found")
            return None
        except ObjectFormatException:
            self._warn_skipped_object(oid, "malformed")
            return None
        except EmptyFileException:
            self._warn_skipped_object(oid, "corrupted (empty file)")
            return None
        else:
            return obj

    def fetch_data(self):
        """Fetch the data from the data source.

        Records the previous visit's snapshot id (if any) for
        load_status(), then walks every object once and indexes its id
        under its git type name.
        """
        visit_and_status = origin_get_latest_visit_status(
            self.storage, self.origin_url, require_snapshot=True
        )
        if visit_and_status is None:
            self.previous_snapshot_id = None
        else:
            _, visit_status = visit_and_status
            self.previous_snapshot_id = visit_status.snapshot

        type_to_ids = defaultdict(list)
        for oid in self.iter_objects():
            obj = self.get_object(oid)
            if not obj:
                # unreadable/malformed objects were already logged; skip
                continue
            type_to_ids[obj.type_name].append(oid)

        self.type_to_ids = type_to_ids

    def has_contents(self):
        """Checks whether we need to load contents"""
        return bool(self.type_to_ids[b"blob"])

    def get_content_ids(self):
        """Get the content identifiers from the git repository"""
        for oid in self.type_to_ids[b"blob"]:
            yield converters.dulwich_blob_to_content_id(self.repo[oid])

    def get_contents(self):
        """Get the contents that need to be loaded"""
        missing_contents = set(
            self.storage.content_missing(self.get_content_ids(), "sha1_git")
        )
        for oid in missing_contents:
            yield converters.dulwich_blob_to_content(
                self.repo[hashutil.hash_to_bytehex(oid)]
            )

    def has_directories(self):
        """Checks whether we need to load directories"""
        return bool(self.type_to_ids[b"tree"])

    def get_directory_ids(self):
        """Get the directory identifiers from the git repository"""
        # `oid` rather than `id`: don't shadow the builtin
        return (
            hashutil.hash_to_bytes(oid.decode()) for oid in self.type_to_ids[b"tree"]
        )

    def get_directories(self):
        """Get the directories that need to be loaded"""
        missing_dirs = set(
            self.storage.directory_missing(sorted(self.get_directory_ids()))
        )
        for oid in missing_dirs:
            yield converters.dulwich_tree_to_directory(
                self.repo[hashutil.hash_to_bytehex(oid)], log=self.log
            )

    def has_revisions(self):
        """Checks whether we need to load revisions"""
        return bool(self.type_to_ids[b"commit"])

    def get_revision_ids(self):
        """Get the revision identifiers from the git repository"""
        return (
            hashutil.hash_to_bytes(oid.decode())
            for oid in self.type_to_ids[b"commit"]
        )

    def get_revisions(self):
        """Get the revisions that need to be loaded"""
        missing_revs = set(
            self.storage.revision_missing(sorted(self.get_revision_ids()))
        )
        for oid in missing_revs:
            yield converters.dulwich_commit_to_revision(
                self.repo[hashutil.hash_to_bytehex(oid)], log=self.log
            )

    def has_releases(self):
        """Checks whether we need to load releases"""
        return bool(self.type_to_ids[b"tag"])

    def get_release_ids(self):
        """Get the release identifiers from the git repository"""
        return (
            hashutil.hash_to_bytes(oid.decode()) for oid in self.type_to_ids[b"tag"]
        )

    def get_releases(self):
        """Get the releases that need to be loaded"""
        missing_rels = set(
            self.storage.release_missing(sorted(self.get_release_ids()))
        )
        for oid in missing_rels:
            yield converters.dulwich_tag_to_release(
                self.repo[hashutil.hash_to_bytehex(oid)], log=self.log
            )

    def get_snapshot(self):
        """Turn the list of branches into a snapshot to load"""
        branches: Dict[bytes, Optional[SnapshotBranch]] = {}

        for ref, target in self.repo.refs.as_dict().items():
            if utils.ignore_branch_name(ref):
                continue
            obj = self.get_object(target)
            if obj:
                target_type = converters.DULWICH_TARGET_TYPES[obj.type_name]
                branches[ref] = SnapshotBranch(
                    target=hashutil.bytehex_to_hash(target),
                    target_type=target_type,
                )
            else:
                # target object unreadable: keep the ref, but dangling
                branches[ref] = None

        dangling_branches = {}
        for ref, target in self.repo.refs.get_symrefs().items():
            if utils.ignore_branch_name(ref):
                continue
            branches[ref] = SnapshotBranch(
                target=target, target_type=TargetType.ALIAS
            )
            if target not in branches:
                # This handles the case where the pointer is "dangling".
                # There's a chance that a further symbolic reference will
                # override this default value, which is totally fine.
                dangling_branches[target] = ref
                branches[target] = None

        utils.warn_dangling_branches(
            branches, dangling_branches, self.log, self.origin_url
        )

        self.snapshot = Snapshot(branches=branches)
        return self.snapshot

    def get_fetch_history_result(self):
        """Return the data to store in fetch_history for the current loader"""
        return {
            "contents": len(self.type_to_ids[b"blob"]),
            "directories": len(self.type_to_ids[b"tree"]),
            "revisions": len(self.type_to_ids[b"commit"]),
            "releases": len(self.type_to_ids[b"tag"]),
        }

    def save_data(self):
        """We already have the data locally, no need to save it"""
        pass

    def load_status(self):
        """The load was eventful if the current occurrences are different to
        the ones we retrieved at the beginning of the run"""
        if self.previous_snapshot_id:
            eventful = self.snapshot.id != self.previous_snapshot_id
        else:
            # first visit: eventful iff anything was found at all
            eventful = bool(self.snapshot.branches)

        return {"status": ("eventful" if eventful else "uneventful")}
class GitLoaderFromArchive(GitLoaderFromDisk):
    """Load a git repository from an archive.

    This loader ingests a git repository compressed into an archive.
    The supported archive formats are ``.zip`` and ``.tar.gz``.

    From an input tarball named ``my-git-repo.zip``, the following layout is
    expected in it::

        my-git-repo/
        ├── .git
        │   ├── branches
        │   ├── COMMIT_EDITMSG
        │   ├── config
        │   ├── description
        │   ├── HEAD
        ...

    Nevertheless, the loader is able to ingest tarballs with the following
    layouts too::

        .
        ├── .git
        │   ├── branches
        │   ├── COMMIT_EDITMSG
        │   ├── config
        │   ├── description
        │   ├── HEAD
        ...

    or::

        other-repo-name/
        ├── .git
        │   ├── branches
        │   ├── COMMIT_EDITMSG
        │   ├── config
        │   ├── description
        │   ├── HEAD
        ...

    """

    def __init__(self, *args, archive_path, **kwargs):
        """
        Args:
            archive_path: path to the archive holding the git repository
                (keyword-only); remaining arguments are forwarded to
                GitLoaderFromDisk.
        """
        super().__init__(*args, **kwargs)
        # both set by prepare(); kept None until then so cleanup() is safe
        self.temp_dir = self.repo_path = None
        self.archive_path = archive_path

    def project_name_from_archive(self, archive_path):
        """Compute the project name from the archive's path.

        The basename is taken and a known archive extension
        (.zip/.tar.gz/.tgz), if any, is stripped.
        """
        archive_name = os.path.basename(archive_path)
        for ext in (".zip", ".tar.gz", ".tgz"):
            if archive_name.lower().endswith(ext):
                archive_name = archive_name[: -len(ext)]
                break
        return archive_name

    def prepare(self, *args, **kwargs):
        """1. Uncompress the archive in temporary location.
        2. Prepare as the GitLoaderFromDisk does
        3. Load as GitLoaderFromDisk does
        """
        project_name = self.project_name_from_archive(self.archive_path)
        self.temp_dir, self.repo_path = utils.init_git_repo_from_archive(
            project_name, self.archive_path
        )

        self.log.info(
            "Project %s - Uncompressing archive %s at %s",
            self.origin_url,
            os.path.basename(self.archive_path),
            self.repo_path,
        )
        self.directory = self.repo_path
        super().prepare(*args, **kwargs)

    def cleanup(self):
        """Cleanup the temporary location (if it exists).
        """
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)
        # lazy %-style logging args (as in prepare()) instead of eager
        # string interpolation
        self.log.info(
            "Project %s - Done injecting %s", self.origin_url, self.repo_path
        )
if __name__ == "__main__":
    import click
    import logging

    logging.basicConfig(
        level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s"
    )

    @click.command()
    @click.option("--origin-url", help="origin url")
    @click.option("--git-directory", help="Path to git repository to load")
    @click.option("--visit-date", default=None, help="Visit date")
    def main(origin_url, git_directory, visit_date):
        """Load the git repository at --git-directory as origin --origin-url."""
        if not visit_date:
            # timezone-aware default: now in UTC
            visit_date = datetime.datetime.now(tz=datetime.timezone.utc)

        # Fix: the visit parameters belong to the constructor
        # (url/directory/visit_date); calling GitLoaderFromDisk() with no
        # arguments raised TypeError since `url` is required.
        return GitLoaderFromDisk(
            url=origin_url, directory=git_directory, visit_date=visit_date
        ).load()

    main()
File Metadata
Details
Attached
Mime Type
text/x-python
Expires
Thu, Jul 3, 10:47 AM (1 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3340688
Attached To
rDLDG Git loader
Event Timeline
Log In to Comment