from_disk.py
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from collections import defaultdict
from datetime import datetime
import os
import shutil
from typing import Dict, Optional

from dulwich.errors import ObjectFormatException

try:
    from dulwich.errors import EmptyFileException  # type: ignore
except ImportError:
    # dulwich >= 0.20
    from dulwich.objects import EmptyFileException

import dulwich.objects
import dulwich.repo

from swh.loader.core.loader import DVCSLoader
from swh.model import hashutil
from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType
from swh.storage.algos.origin import origin_get_latest_visit_status
from swh.storage.interface import StorageInterface

from . import converters, utils


def _check_tag(tag):
    """Copy-paste of dulwich.objects.Tag, minus the tagger and time checks,
    which are too strict and error on old tags."""
    # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
    # Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
    #
    # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
    # General Public License as public by the Free Software Foundation; version 2.0
    # or (at your option) any later version. You can redistribute it and/or
    # modify it under the terms of either of these two licenses.
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    #
    # You should have received a copy of the licenses; if not, see
    # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
    # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
    # License, Version 2.0.
    dulwich.objects.ShaFile.check(tag)
    tag._check_has_member("_object_sha", "missing object sha")
    tag._check_has_member("_object_class", "missing object type")
    tag._check_has_member("_name", "missing tag name")

    if not tag._name:
        raise ObjectFormatException("empty tag name")

    dulwich.objects.check_hexsha(tag._object_sha, "invalid object sha")

    if tag._tag_time is not None:
        dulwich.objects.check_time(tag._tag_time)

    from dulwich.objects import (
        _OBJECT_HEADER,
        _TAG_HEADER,
        _TAGGER_HEADER,
        _TYPE_HEADER,
    )

    last = None
    for field, _ in dulwich.objects._parse_message(tag._chunked_text):
        if field == _OBJECT_HEADER and last is not None:
            raise ObjectFormatException("unexpected object")
        elif field == _TYPE_HEADER and last != _OBJECT_HEADER:
            raise ObjectFormatException("unexpected type")
        elif field == _TAG_HEADER and last != _TYPE_HEADER:
            raise ObjectFormatException("unexpected tag name")
        elif field == _TAGGER_HEADER and last != _TAG_HEADER:
            raise ObjectFormatException("unexpected tagger")
        last = field
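

# A hedged illustration (not part of the loader) of why the relaxed check above
# exists: an old tag object lacking a tagger header fails dulwich's strict
# ``Tag.check()`` but is still accepted by ``_check_tag``. Assuming ``repo`` is
# an open ``dulwich.repo.Repo`` and ``tag_sha`` names such a tag:
#
#     tag = repo[tag_sha]
#     try:
#         tag.check()              # dulwich's built-in check may raise here
#     except ObjectFormatException:
#         _check_tag(tag)          # the relaxed check validates it nonetheless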


class GitLoaderFromDisk(DVCSLoader):
    """Load a git repository from a directory."""

    visit_type = "git"

    def __init__(
        self,
        storage: StorageInterface,
        url: str,
        visit_date: Optional[datetime] = None,
        directory: Optional[str] = None,
        save_data_path: Optional[str] = None,
        max_content_size: Optional[int] = None,
    ):
        super().__init__(
            storage=storage,
            save_data_path=save_data_path,
            max_content_size=max_content_size,
        )
        self.origin_url = url
        self.visit_date = visit_date
        self.directory = directory

    def prepare_origin_visit(self):
        self.origin = Origin(url=self.origin_url)

    def prepare(self):
        self.repo = dulwich.repo.Repo(self.directory)

    def iter_objects(self):
        object_store = self.repo.object_store

        for pack in object_store.packs:
            objs = list(pack.index.iterentries())
            objs.sort(key=lambda x: x[1])
            for sha, offset, crc32 in objs:
                yield hashutil.hash_to_bytehex(sha)

        yield from object_store._iter_loose_objects()
        yield from object_store._iter_alternate_objects()

    def _check(self, obj):
        """Check the object's repository representation.

        If any error is found during the check, an ObjectFormatException is
        raised.

        Args:
            obj (object): Dulwich object read from the repository.

        """
        if isinstance(obj, dulwich.objects.Tag):
            _check_tag(obj)
        else:
            obj.check()

        try:
            # For additional checks on dulwich objects with date
            # for now, only checks on *time
            if isinstance(obj, dulwich.objects.Commit):
                commit_time = obj._commit_time
                utils.check_date_time(commit_time)
                author_time = obj._author_time
                utils.check_date_time(author_time)
            elif isinstance(obj, dulwich.objects.Tag):
                tag_time = obj._tag_time
                if tag_time:
                    utils.check_date_time(tag_time)
        except Exception as e:
            raise ObjectFormatException(e)

    def get_object(self, oid):
        """Given an object id, return the object if it is found and not
        malformed in some way.

        Args:
            oid (bytes): the object's identifier

        Returns:
            The object if found without malformation

        """
        try:
            # some errors are raised when reading the object
            obj = self.repo[oid]
            # some we need to check ourselves
            self._check(obj)
        except KeyError:
            _id = oid.decode("utf-8")
            self.log.warn(
                "object %s not found, skipping" % _id,
                extra={
                    "swh_type": "swh_loader_git_missing_object",
                    "swh_object_id": _id,
                    "origin_url": self.origin.url,
                },
            )
            return None
        except ObjectFormatException as e:
            id_ = oid.decode("utf-8")
            self.log.warn(
                "object %s malformed (%s), skipping",
                id_,
                e.args[0],
                extra={
                    "swh_type": "swh_loader_git_missing_object",
                    "swh_object_id": id_,
                    "origin_url": self.origin.url,
                },
            )
            return None
        except EmptyFileException:
            id_ = oid.decode("utf-8")
            self.log.warn(
                "object %s corrupted (empty file), skipping",
                id_,
                extra={
                    "swh_type": "swh_loader_git_missing_object",
                    "swh_object_id": id_,
                    "origin_url": self.origin.url,
                },
            )
        else:
            return obj

    def fetch_data(self):
        """Fetch the data from the data source"""
        visit_status = origin_get_latest_visit_status(
            self.storage, self.origin_url, require_snapshot=True
        )
        self.previous_snapshot_id = (
            None if visit_status is None else visit_status.snapshot
        )

        type_to_ids = defaultdict(list)
        for oid in self.iter_objects():
            obj = self.get_object(oid)
            if obj is None:
                continue
            type_name = obj.type_name
            type_to_ids[type_name].append(oid)

        self.type_to_ids = type_to_ids

    def has_contents(self):
        """Checks whether we need to load contents"""
        return bool(self.type_to_ids[b"blob"])

    def get_content_ids(self):
        """Get the content identifiers from the git repository"""
        for oid in self.type_to_ids[b"blob"]:
            yield converters.dulwich_blob_to_content_id(self.repo[oid])

    def get_contents(self):
        """Get the contents that need to be loaded"""
        missing_contents = set(
            self.storage.content_missing(self.get_content_ids(), "sha1_git")
        )

        for oid in missing_contents:
            yield converters.dulwich_blob_to_content(
                self.repo[hashutil.hash_to_bytehex(oid)]
            )

    def has_directories(self):
        """Checks whether we need to load directories"""
        return bool(self.type_to_ids[b"tree"])

    def get_directory_ids(self):
        """Get the directory identifiers from the git repository"""
        return (
            hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tree"]
        )

    def get_directories(self):
        """Get the directories that need to be loaded"""
        missing_dirs = set(
            self.storage.directory_missing(sorted(self.get_directory_ids()))
        )

        for oid in missing_dirs:
            yield converters.dulwich_tree_to_directory(
                self.repo[hashutil.hash_to_bytehex(oid)], log=self.log
            )

    def has_revisions(self):
        """Checks whether we need to load revisions"""
        return bool(self.type_to_ids[b"commit"])

    def get_revision_ids(self):
        """Get the revision identifiers from the git repository"""
        return (
            hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"commit"]
        )

    def get_revisions(self):
        """Get the revisions that need to be loaded"""
        missing_revs = set(
            self.storage.revision_missing(sorted(self.get_revision_ids()))
        )

        for oid in missing_revs:
            yield converters.dulwich_commit_to_revision(
                self.repo[hashutil.hash_to_bytehex(oid)], log=self.log
            )

    def has_releases(self):
        """Checks whether we need to load releases"""
        return bool(self.type_to_ids[b"tag"])

    def get_release_ids(self):
        """Get the release identifiers from the git repository"""
        return (
            hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tag"]
        )

    def get_releases(self):
        """Get the releases that need to be loaded"""
        missing_rels = set(
            self.storage.release_missing(sorted(self.get_release_ids()))
        )

        for oid in missing_rels:
            yield converters.dulwich_tag_to_release(
                self.repo[hashutil.hash_to_bytehex(oid)], log=self.log
            )

    def get_snapshot(self):
        """Turn the list of branches into a snapshot to load"""
        branches: Dict[bytes, Optional[SnapshotBranch]] = {}

        for ref, target in self.repo.refs.as_dict().items():
            if utils.ignore_branch_name(ref):
                continue
            obj = self.get_object(target)
            if obj:
                target_type = converters.DULWICH_TARGET_TYPES[obj.type_name]
                branches[ref] = SnapshotBranch(
                    target=hashutil.bytehex_to_hash(target),
                    target_type=target_type,
                )
            else:
                branches[ref] = None

        dangling_branches = {}
        for ref, target in self.repo.refs.get_symrefs().items():
            if utils.ignore_branch_name(ref):
                continue
            branches[ref] = SnapshotBranch(target=target, target_type=TargetType.ALIAS)
            if target not in branches:
                # This handles the case where the pointer is "dangling".
                # There's a chance that a further symbolic reference will
                # override this default value, which is totally fine.
                dangling_branches[target] = ref
                branches[target] = None

        utils.warn_dangling_branches(
            branches, dangling_branches, self.log, self.origin_url
        )

        self.snapshot = Snapshot(branches=branches)
        return self.snapshot
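
    # For illustration only (a hedged sketch, not output from a real run): a
    # repository whose HEAD points at refs/heads/master would yield a branches
    # mapping roughly like
    #
    #     {
    #         b"refs/heads/master": SnapshotBranch(
    #             target=<20-byte sha1>, target_type=TargetType.REVISION
    #         ),
    #         b"HEAD": SnapshotBranch(
    #             target=b"refs/heads/master", target_type=TargetType.ALIAS
    #         ),
    #     }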

    def save_data(self):
        """We already have the data locally, no need to save it"""
        pass

    def load_status(self):
        """The load was eventful if the current occurrences are different from
        the ones we retrieved at the beginning of the run"""
        eventful = False

        if self.previous_snapshot_id:
            eventful = self.snapshot.id != self.previous_snapshot_id
        else:
            eventful = bool(self.snapshot.branches)

        return {"status": ("eventful" if eventful else "uneventful")}
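

# A minimal usage sketch (a hedged example, not part of this module): assuming
# ``storage`` is an already-configured ``StorageInterface`` and the repository
# has been cloned under ``/srv/repos/my-project``, the loader is driven through
# the ``load()`` entry point inherited from the core loader:
#
#     loader = GitLoaderFromDisk(
#         storage,
#         url="https://example.org/my-project.git",
#         directory="/srv/repos/my-project",
#     )
#     result = loader.load()
#     print(result["status"])   # "eventful" or "uneventful", per load_status()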


class GitLoaderFromArchive(GitLoaderFromDisk):
    """Load a git repository from an archive.

    This loader ingests a git repository compressed into an archive.
    The supported archive formats are ``.zip`` and ``.tar.gz``.

    From an input tarball named ``my-git-repo.zip``, the following layout is
    expected in it::

        my-git-repo/
        ├── .git
        │   ├── branches
        │   ├── COMMIT_EDITMSG
        │   ├── config
        │   ├── description
        │   ├── HEAD
        ...

    Nevertheless, the loader is able to ingest tarballs with the following
    layouts too::

        .
        ├── .git
        │   ├── branches
        │   ├── COMMIT_EDITMSG
        │   ├── config
        │   ├── description
        │   ├── HEAD
        ...

    or::

        other-repo-name/
        ├── .git
        │   ├── branches
        │   ├── COMMIT_EDITMSG
        │   ├── config
        │   ├── description
        │   ├── HEAD
        ...

    """

    def __init__(self, *args, archive_path, **kwargs):
        super().__init__(*args, **kwargs)
        self.temp_dir = self.repo_path = None
        self.archive_path = archive_path

    def project_name_from_archive(self, archive_path):
        """Compute the project name from the archive's path."""
        archive_name = os.path.basename(archive_path)
        for ext in (".zip", ".tar.gz", ".tgz"):
            if archive_name.lower().endswith(ext):
                archive_name = archive_name[: -len(ext)]
                break
        return archive_name

    def prepare(self):
        """1. Uncompress the archive in a temporary location.
        2. Prepare as GitLoaderFromDisk does.
        3. Load as GitLoaderFromDisk does.

        """
        project_name = self.project_name_from_archive(self.archive_path)
        self.temp_dir, self.repo_path = utils.init_git_repo_from_archive(
            project_name, self.archive_path
        )

        self.log.info(
            "Project %s - Uncompressing archive %s at %s",
            self.origin_url,
            os.path.basename(self.archive_path),
            self.repo_path,
        )
        self.directory = self.repo_path
        super().prepare()

    def cleanup(self):
        """Cleanup the temporary location (if it exists)."""
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)
        self.log.info(
            "Project %s - Done injecting %s" % (self.origin_url, self.repo_path)
        )
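

# A similar hedged sketch for the archive-based loader (again assuming
# ``storage`` is a configured ``StorageInterface`` and the archive follows one
# of the layouts documented on the class):
#
#     loader = GitLoaderFromArchive(
#         storage,
#         url="https://example.org/my-git-repo",
#         archive_path="/srv/archives/my-git-repo.zip",
#     )
#     result = loader.load()   # prepare() uncompresses; cleanup() removes temp_dir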