From 17f04939f4fd44684e223c518d705619d2d83fee Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" <antoine.romain.dumont@gmail.com> Date: Wed, 8 Jun 2016 15:09:18 +0200 Subject: [PATCH] Open a new walk_and_compute_sha1_from_directory_2 api This actually is supposed to replace walk_and_compute_sha1_from_directory. The data structure used here is better at handling updates. (Code that actually got lost and rewritten - Tests are definitely lost though) --- swh/model/git.py | 119 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 2 deletions(-) diff --git a/swh/model/git.py b/swh/model/git.py index 4ed9036e..f9b7b275 100644 --- a/swh/model/git.py +++ b/swh/model/git.py @@ -338,7 +338,10 @@ def walk_and_compute_sha1_from_directory(rootdir, dir_ok_fn=default_validation_dir, with_root_tree=True, remove_empty_folder=False): - """Compute git sha1 from directory rootdir. + """(Deprecated) TODO migrate the code to + walk_and_compute_sha1_from_directory_2. + + Compute git sha1 from directory rootdir. Args: - rootdir: Root directory from which beginning the git hash computation @@ -417,8 +420,120 @@ def walk_and_compute_sha1_from_directory(rootdir, return ls_hashes +def walk_and_compute_sha1_from_directory_2(rootdir, + dir_ok_fn=default_validation_dir, + remove_empty_folder=False): + """Compute git sha1 from directory rootdir. + + Args: + - rootdir: Root directory from which beginning the git hash + computation + + - dir_ok_fn: Filter function to filter directory according to rules + defined in the function. By default, all folders are ok. + Example override: dir_ok_fn = lambda dirpath: b'svn' not in dirpath + + Returns: + Dictionary of entries with keys absolute path name. + Path-name can be a file/link or directory. + The associated value is a dictionary with: + - checksums: the dictionary with the hashes for the link/file/dir + Those are list of dictionary with keys: + - 'perms' + - 'type' + - 'name' + - 'sha1_git' + - and specifically content: 'sha1', 'sha256', ... + + - children: Only for a directory, the set of children paths + + Note: + One special key is the / which indicates the upper root of + the directory (this is the revision's directory). + + Raises: + Nothing + If something is raised, this is a programmatic error. + + """ + def __get_dict_from_dirpath(_dict, path): + """Retrieve the default associated value for key path. + + """ + return _dict.get(path, dict(children=set(), checksums=None)) + + def __get_dict_from_filepath(_dict, path): + """Retrieve the default associated value for key path. + + """ + return _dict.get(path, dict(checksums=None)) + + ls_hashes = {} + all_links = set() + + if rootdir.endswith(b'/'): + rootdir = rootdir.rstrip(b'/') + + for dirpath, dirnames, filenames in __walk( + rootdir, dir_ok_fn, remove_empty_folder): + + dir_entry = __get_dict_from_dirpath(ls_hashes, dirpath) + children = dir_entry['children'] + + links = (file + for file in filenames.union(dirnames) + if os.path.islink(file)) + + for linkpath in links: + all_links.add(linkpath) + m_hashes = compute_link_metadata(linkpath) + d = __get_dict_from_filepath(ls_hashes, linkpath) + d['checksums'] = m_hashes + ls_hashes[linkpath] = d + children.add(linkpath) + + for filepath in (file for file in filenames if file not in all_links): + m_hashes = compute_blob_metadata(filepath) + d = __get_dict_from_filepath(ls_hashes, filepath) + d['checksums'] = m_hashes + ls_hashes[filepath] = d + children.add(filepath) + + for fulldirname in (dir for dir in dirnames if dir not in all_links): + d_hashes = __get_dict_from_dirpath(ls_hashes, fulldirname) + tree_hash = _compute_tree_metadata( + fulldirname, + (ls_hashes[p]['checksums'] for p in d_hashes['children']) + ) + d = __get_dict_from_dirpath(ls_hashes, fulldirname) + d['checksums'] = tree_hash + ls_hashes[fulldirname] = d + children.add(fulldirname) + + dir_entry['children'] = children + ls_hashes[dirpath] = dir_entry + + # compute the current directory hashes + d_hashes = __get_dict_from_dirpath(ls_hashes, rootdir) + root_hash = { + 'sha1_git': _compute_directory_git_sha1( + (ls_hashes[p]['checksums'] for p in d_hashes['children']) + ), + 'path': rootdir, + 'name': os.path.basename(rootdir), + 'perms': GitPerm.TREE, + 'type': GitType.TREE + } + d_hashes['checksums'] = root_hash + ls_hashes[rootdir] = d_hashes + + return ls_hashes + + def recompute_sha1_in_memory(root, deeper_rootdir, objects): - """Recompute git sha1 from directory deeper_rootdir to root. + """TODO: Use git.walk_and_compute_sha1_from_directory_2 + + Recompute git sha1 from directory deeper_rootdir to root. This function relies exclusively on `objects` for hashes. It expects the deeper_rootdir and every key below that path to be -- GitLab