Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
S
swh-loader-git
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Antoine Lambert
swh-loader-git
Commits
65bbe015
Commit
65bbe015
authored
9 years ago
by
Antoine R. Dumont
Browse files
Options
Downloads
Patches
Plain Diff
Use backend API to filter known sha1s
parent
376309e9
Branches
improve-object-loading
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
swh/gitloader/loader.py
+72
-33
72 additions, 33 deletions
swh/gitloader/loader.py
swh/http/client.py
+21
-0
21 additions, 0 deletions
swh/http/client.py
with
93 additions
and
33 deletions
swh/gitloader/loader.py
+
72
−
33
View file @
65bbe015
...
...
@@ -7,8 +7,9 @@
import
logging
import
pygit2
from
pygit2
import
GIT_REF_OID
from
pygit2
import
GIT_
FILEMODE_TREE
,
GIT_FILEMODE_COMMIT
,
GIT_OBJ_
COMMIT
from
pygit2
import
GIT_
OBJ_COMMIT
,
GIT_OBJ_TREE
,
GIT_OBJ_
BLOB
from
swh
import
hash
from
swh.storage
import
store
...
...
@@ -41,52 +42,90 @@ def load_repo(baseurl,
'
content
'
:
blob_entry_ref
.
data
})
def
store_commit
(
repo
,
commit_to_store
):
"""
Store a commit in swh storage.
def
treewalk
(
repo
,
tree
,
topdown
=
True
):
"""
Walk a tree with the same implementation as `os.path`.
Returns: tree, trees, blobs
"""
store_tree
(
repo
,
commit_to_store
.
tree
)
store_object
(
commit_to_store
,
store
.
Type
.
commit
)
trees
,
blobs
=
[],
[]
for
tree_entry
in
tree
:
obj
=
repo
.
get
(
tree_entry
.
oid
,
None
)
if
obj
is
None
:
logging
.
warn
(
'
skip submodule-commit %s
'
%
tree_entry
.
hex
)
continue
# submodule!
if
obj
.
type
==
GIT_OBJ_TREE
:
trees
.
append
(
obj
)
else
:
blobs
.
append
(
obj
)
if
topdown
:
yield
tree
,
trees
,
blobs
for
tree_entry
in
trees
:
for
x
in
treewalk
(
repo
,
repo
[
tree_entry
.
oid
],
topdown
):
yield
x
if
not
topdown
:
yield
tree
,
trees
,
blobs
# Lookup table translating a pygit2 object-type constant into the matching
# swh storage type.
type_to = {
    GIT_OBJ_BLOB: store.Type.blob,
    GIT_OBJ_TREE: store.Type.tree,
    GIT_OBJ_COMMIT: store.Type.commit,
}


def store_ref(sha1_hex, obj_ref):
    """Store obj_ref through the helper matching its git object type.

    Blobs are handed to store_blob together with sha1_hex; every other
    mapped type is handed to store_object with its storage type.
    An object type absent from type_to raises KeyError.
    """
    kind = type_to[obj_ref.type]

    # Everything that is not a blob shares the generic storage path.
    if kind != store.Type.blob:
        store_object(obj_ref, kind)
        return

    store_blob(obj_ref, sha1_hex)
def
store_tree
(
repo
,
tree_ref
):
"""
Given a tree, w
alk the tree and save the blobs in file content storage
"""
W
alk the tree and save the blobs in file content storage
(if not already present).
"""
tree_sha1_hex
=
tree_ref
.
hex
#
tree_sha1_hex = tree_ref.hex
if
client
.
get
(
baseurl
,
store
.
Type
.
tree
,
tree_sha1_hex
):
logging
.
debug
(
'
skip tree %s
'
%
tree_sha1_hex
)
return
#
if client.get(baseurl, store.Type.tree, tree_sha1_hex):
#
logging.debug('skip tree %s' % tree_sha1_hex)
#
return
for
tree_entry
in
tree_ref
:
filemode
=
tree_entry
.
filemode
tree_id
=
tree_entry
.
id
sha1s_hex
=
[]
sha1s_map
=
{}
for
ori_tree_ref
,
trees_ref
,
blobs_ref
in
treewalk
(
repo
,
tree_ref
,
topdown
=
False
):
if
(
filemode
==
GIT_FILEMODE_COMMIT
):
# submodule!
logging
.
warn
(
'
skip submodule-commit %s
'
%
tree_id
)
continue
for
blob_ref
in
blobs_ref
:
blob_data_sha1hex
=
hash
.
hashkey_sha1
(
blob_ref
.
data
).
hexdigest
()
sha1s_hex
.
append
(
blob_data_sha1hex
)
sha1s_map
[
blob_data_sha1hex
]
=
blob_ref
# store_blob(blob_ref, blob_data_sha1hex)
for
tree_ref
in
trees_ref
:
sha1s_hex
.
append
(
tree_ref
.
hex
)
sha1s_map
[
tree_ref
.
hex
]
=
tree_ref
# store_object(tree_ref, store.Type.tree)
sha1s_hex
.
append
(
ori_tree_ref
.
hex
)
sha1s_map
[
ori_tree_ref
.
hex
]
=
ori_tree_ref
# store_object(ori_tree_ref, store.Type.tree)
elif
(
filemode
==
GIT_FILEMODE_TREE
):
# Tree
logging
.
debug
(
'
walk tree %s
'
%
tree_id
)
store_tree
(
repo
,
repo
[
tree_id
])
return
sha1s_hex
,
sha1s_map
else
:
# blob
blob_entry_ref
=
repo
[
tree_id
]
hashkey
=
hash
.
hashkey_sha1
(
blob_entry_ref
.
data
)
blob_data_sha1_hex
=
hashkey
.
hexdigest
()
# store_object(tree_ref, store.Type.tree)
def
store_commit
(
repo
,
commit_to_store
):
"""
Store a commit in swh storage.
"""
sha1s_hex
,
sha1s_map
=
store_tree
(
repo
,
commit_to_store
.
tree
)
sha1s_hex
.
append
(
commit_to_store
.
hex
)
sha1s_map
[
commit_to_store
.
hex
]
=
commit_to_store
if
client
.
get
(
baseurl
,
store
.
Type
.
blob
,
blob_data_sha1_hex
):
logging
.
debug
(
'
skip blob %s
'
%
blob_entry_ref
.
hex
)
continue
res
=
client
.
post
(
baseurl
,
{
'
sha1s
'
:
sha1s_hex
})
store_blob
(
blob_entry_ref
,
blob_data_sha1_hex
)
for
unknown_ref_sha1
in
res
:
store_ref
(
unknown_ref_sha1
,
sha1s_map
[
unknown_ref_sha1
]
)
store_object
(
tree_ref
,
store
.
Type
.
tree
)
def
walk_revision_from
(
repo
,
head_commit
,
visited
):
"""
Walk the revision from commit head_commit.
...
...
This diff is collapsed.
Click to expand it.
swh/http/client.py
+
21
−
0
View file @
65bbe015
...
...
@@ -7,6 +7,7 @@
# See top-level LICENSE file for more information
import
requests
import
json
from
retrying
import
retry
...
...
@@ -27,6 +28,12 @@ def compute_url(baseurl, type, sha1hex):
return
'
%s%s%s
'
%
(
baseurl
,
_api_url
[
type
],
sha1hex
)
def compute_simple_url(baseurl, type):
    """Build an api url by appending the path `type` to `baseurl`.

    The two parts are concatenated as-is; no separator is inserted.
    """
    return '{}{}'.format(baseurl, type)
@retry
(
retry_on_exception
=
policy
.
retry_if_connection_error
,
wrap_exception
=
True
,
stop_max_attempt_number
=
3
)
...
...
@@ -46,3 +53,17 @@ def put(baseurl, type, sha1hex, data=None):
r
=
session_swh
.
put
(
compute_url
(
baseurl
,
type
,
sha1hex
),
[]
if
data
is
None
else
data
)
return
r
.
ok
@retry(retry_on_exception=policy.retry_if_connection_error,
       wrap_exception=True,
       stop_max_attempt_number=3)
def post(baseurl, sha1s):
    """POST `sha1s`, serialized as JSON, to the `/objects/` endpoint.

    The request goes to baseurl + "/objects/" with a JSON content type.
    Returns the value stored under the 'sha1s' key of the JSON response.

    NOTE(review): presumably the returned list is the subset of sha1s the
    backend does not know yet - confirm against the server implementation.
    The call is retried (at most 3 attempts) when
    policy.retry_if_connection_error accepts the raised exception.
    """
    response = session_swh.post(
        compute_simple_url(baseurl, "/objects/"),
        data=json.dumps(sha1s),
        headers={'Content-type': 'application/json'})
    return response.json()['sha1s']
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment