Skip to content
Snippets Groups Projects
Commit d52afd0e authored by Daniele Serafini's avatar Daniele Serafini
Browse files

scanner: option to exclude specific patterns

For each exclusion pattern given on the command line, the scanner
compiles one regex object from the regular expression produced by
fnmatch.translate, and collects these objects into a set.
parent 62be9f10
No related branches found
No related tags found
1 merge request!12scanner: exclusion list through glob patterns
......@@ -5,10 +5,15 @@
import click
import asyncio
import glob
import re
import fnmatch
from pathlib import PosixPath
from typing import Tuple
from .scanner import run
from .model import Tree
from .exceptions import InvalidDirectoryPath
from swh.core.cli import CONTEXT_SETTINGS
......@@ -28,8 +33,30 @@ def parse_url(url):
return url
def extract_regex_objs(root_path: PosixPath, patterns: Tuple[str, ...]) -> object:
    """Yield one compiled regex for each glob pattern that matches on disk.

    Every pattern is expanded with ``glob.glob``. Each path it matches must
    have ``root_path`` among its parents; patterns that match nothing are
    skipped without error.

    Args:
        root_path: directory that every matched path must be located under
        patterns: glob patterns to turn into regex objects

    Yields:
        a compiled regex object built from ``fnmatch.translate`` applied to
        the (PosixPath-normalized) pattern

    Raises:
        InvalidDirectoryPath: if a path matched by a pattern does not have
            ``root_path`` among its parents
    """
    for pattern in patterns:
        # Expand the glob only once so that the validation below and the
        # "did it match anything" check work on the same directory listing.
        matched_paths = glob.glob(pattern)
        if not matched_paths:
            continue

        for path in matched_paths:
            dirpath = PosixPath(path)
            if root_path not in dirpath.parents:
                error_msg = (
                    f'The path "{dirpath}" is not a subdirectory or relative '
                    f'to the root directory path: "{root_path}"'
                )
                raise InvalidDirectoryPath(error_msg)

        # Going through PosixPath drops trailing slashes and redundant
        # separators from the pattern before it is translated.
        regex = fnmatch.translate(str(PosixPath(pattern)))
        yield re.compile(regex)
@scanner.command(name="scan")
@click.argument("path", required=True, type=click.Path(exists=True))
@click.argument("root_path", required=True, type=click.Path(exists=True))
@click.option(
"-u",
"--api-url",
......@@ -38,6 +65,14 @@ def parse_url(url):
show_default=True,
help="url for the api request",
)
@click.option(
"--exclude",
"-x",
"patterns",
metavar="PATTERN",
multiple=True,
help="recursively exclude a specific pattern",
)
@click.option(
"-f",
"--format",
......@@ -46,14 +81,19 @@ def parse_url(url):
help="select the output format",
)
@click.pass_context
def scan(ctx, path, api_url, format):
def scan(ctx, root_path, api_url, patterns, format):
"""Scan a source code project to discover files and directories already
present in the archive"""
sre_patterns = set()
if patterns:
sre_patterns = {
reg_obj for reg_obj in extract_regex_objs(PosixPath(root_path), patterns)
}
api_url = parse_url(api_url)
source_tree = Tree(PosixPath(path))
source_tree = Tree(PosixPath(root_path))
loop = asyncio.get_event_loop()
loop.run_until_complete(run(path, api_url, source_tree))
loop.run_until_complete(run(root_path, api_url, source_tree, sre_patterns))
source_tree.show(format)
......
......@@ -8,6 +8,10 @@ class InvalidObjectType(TypeError):
pass
class InvalidDirectoryPath(Exception):
    """Raised when a path is not located under the expected root directory."""
class APIError(Exception):
    """Exception whose string form is its message wrapped in double quotes."""

    def __str__(self):
        # Formatting with ``% self.args`` directly raises TypeError unless
        # exactly one argument was given, so the single-argument case is
        # handled explicitly and any other arity is joined with ", ".
        if len(self.args) == 1:
            return '"%s"' % (self.args[0],)
        return '"%s"' % ", ".join(str(arg) for arg in self.args)
......
......@@ -7,14 +7,19 @@ import os
import itertools
import asyncio
import aiohttp
from typing import List, Dict, Tuple, Iterator
from typing import List, Dict, Tuple, Iterator, Union, Set, Any
from pathlib import PosixPath
from .exceptions import error_response
from .model import Tree
from swh.model.cli import pid_of_file, pid_of_dir
from swh.model.identifiers import parse_persistent_identifier, DIRECTORY, CONTENT
from swh.model.from_disk import Directory, Content, accept_all_directories
from swh.model.identifiers import (
persistent_identifier,
parse_persistent_identifier,
DIRECTORY,
CONTENT,
)
async def pids_discovery(
......@@ -61,7 +66,26 @@ async def pids_discovery(
return await make_request(pids)
def get_subpaths(path: PosixPath) -> Iterator[Tuple[PosixPath, str]]:
def directory_filter(path_name: Union[str, bytes], exclude_patterns: Set[Any]) -> bool:
    """Tell whether a path survives the given exclusion patterns.

    The path is decoded if it is given as bytes, normalized through
    ``PosixPath`` and then tested against every compiled pattern with
    ``match``. It can also serve as a `dir_filter` function when generating
    the directory object from `swh.model.from_disk`.

    Returns:
        False if the directory has to be ignored, True otherwise
    """
    if isinstance(path_name, bytes):
        path_name = path_name.decode()
    normalized = str(PosixPath(path_name))
    return not any(regex.match(normalized) for regex in exclude_patterns)
def get_subpaths(
path: PosixPath, exclude_patterns: Set[Any]
) -> Iterator[Tuple[PosixPath, str]]:
"""Find the persistent identifier of the directories and files under a
given path.
......@@ -75,9 +99,22 @@ def get_subpaths(path: PosixPath) -> Iterator[Tuple[PosixPath, str]]:
def pid_of(path):
if path.is_dir():
return pid_of_dir(bytes(path))
elif path.is_file() or path.is_symlink():
return pid_of_file(bytes(path))
if exclude_patterns:
def dir_filter(dirpath, *args):
return directory_filter(dirpath, exclude_patterns)
else:
dir_filter = accept_all_directories
obj = Directory.from_disk(
path=bytes(path), dir_filter=dir_filter
).get_data()
return persistent_identifier(DIRECTORY, obj)
else:
obj = Content.from_file(path=bytes(path)).get_data()
return persistent_identifier(CONTENT, obj)
dirpath, dnames, fnames = next(os.walk(path))
for node in itertools.chain(dnames, fnames):
......@@ -86,7 +123,10 @@ def get_subpaths(path: PosixPath) -> Iterator[Tuple[PosixPath, str]]:
async def parse_path(
path: PosixPath, session: aiohttp.ClientSession, api_url: str
path: PosixPath,
session: aiohttp.ClientSession,
api_url: str,
exclude_patterns: Set[Any],
) -> Iterator[Tuple[str, str, bool]]:
"""Check if the sub paths of the given path are present in the
archive or not.
......@@ -100,7 +140,7 @@ async def parse_path(
the pid of the subpath and the result of the api call
"""
parsed_paths = dict(get_subpaths(path))
parsed_paths = dict(get_subpaths(path, exclude_patterns))
parsed_pids = await pids_discovery(list(parsed_paths.values()), session, api_url)
def unpack(tup):
......@@ -110,7 +150,9 @@ async def parse_path(
return map(unpack, parsed_paths.items())
async def run(root: PosixPath, api_url: str, source_tree: Tree) -> None:
async def run(
root: PosixPath, api_url: str, source_tree: Tree, exclude_patterns: Set[Any]
) -> None:
"""Start scanning from the given root.
It fills the source tree with the path discovered.
......@@ -121,18 +163,20 @@ async def run(root: PosixPath, api_url: str, source_tree: Tree) -> None:
"""
async def _scan(root, session, api_url, source_tree):
for path, pid, found in await parse_path(root, session, api_url):
async def _scan(root, session, api_url, source_tree, exclude_patterns):
for path, pid, found in await parse_path(
root, session, api_url, exclude_patterns
):
obj_type = parse_persistent_identifier(pid).object_type
if obj_type == CONTENT:
source_tree.addNode(path, pid if found else None)
elif obj_type == DIRECTORY:
elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns):
if found:
source_tree.addNode(path, pid)
else:
source_tree.addNode(path)
await _scan(path, session, api_url, source_tree)
await _scan(path, session, api_url, source_tree, exclude_patterns)
async with aiohttp.ClientSession() as session:
await _scan(root, session, api_url, source_tree)
await _scan(root, session, api_url, source_tree, exclude_patterns)
......@@ -14,4 +14,5 @@ present_pids = [
"swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md
"swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary
"swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/
"swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/
]
{
"foo": {
"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
},
"bar": {
"barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
},
"link-to-foo": {
"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
},
"some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
}
{"foo": {"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"}, "bar": {"barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"}, "link-to-foo": {"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"}, "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"}
{
"foo": {
"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
},
"bar": {
"barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
},
"link-to-foo": {
"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
},
"toexclude": "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326",
"some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
}
example file
import pytest
from swh.scanner.cli import extract_regex_objs
from swh.scanner.exceptions import InvalidDirectoryPath
def test_extract_regex_objs(temp_folder):
    """Check the regex extraction on matching, missing and outside paths."""
    root = temp_folder["root"]
    # "/none" is presumably a path that does not exist, so only the
    # sub-directory pattern is expected to produce a regex object.
    base_patterns = (str(temp_folder["subdir"]), "/none")

    compiled = list(extract_regex_objs(root, base_patterns))
    assert len(compiled) == 1

    # "/tmp" is expected to exist without being located under the root.
    with pytest.raises(InvalidDirectoryPath):
        list(extract_regex_objs(root, (*base_patterns, "/tmp")))
......@@ -11,6 +11,7 @@ from .data import correct_api_response
from swh.scanner.scanner import pids_discovery, get_subpaths, run
from swh.scanner.model import Tree
from swh.scanner.cli import extract_regex_objs
from swh.scanner.exceptions import APIError
aio_url = "http://example.org/api/known/"
......@@ -51,13 +52,17 @@ def test_scanner_raise_apierror_input_size_limit(event_loop, aiosession, live_se
event_loop.run_until_complete(pids_discovery(request, aiosession, api_url))
def test_scanner_get_subpaths(temp_folder, tmp_path):
paths = temp_folder["paths"].keys()
pids = temp_folder["paths"].values()
def test_scanner_get_subpaths(temp_folder):
root = temp_folder["root"]
for subpath, pid in get_subpaths(tmp_path):
assert subpath in paths
assert pid in pids
actual_result = []
for subpath, pid in get_subpaths(root, tuple()):
# also check if it's a symlink since pytest tmp_dir fixture create
# also a symlink to each directory inside the tmp_dir path
if subpath.is_dir() and not subpath.is_symlink():
actual_result.append((subpath, pid))
assert len(actual_result) == 2
@pytest.mark.options(debug=False)
......@@ -75,7 +80,32 @@ def test_scanner_result(live_server, event_loop, test_folder):
sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
source_tree = Tree(sample_folder)
event_loop.run_until_complete(run(sample_folder, api_url, source_tree))
event_loop.run_until_complete(run(sample_folder, api_url, source_tree, tuple()))
actual_result = source_tree.getTree()
assert actual_result == expected_result
def test_scanner_result_with_exclude_patterns(live_server, event_loop, test_folder):
api_url = live_server.url() + "/"
result_path = test_folder.joinpath(
PosixPath("sample-folder-result-no-toexclude.json")
)
with open(result_path, "r") as json_file:
expected_result = json.loads(json_file.read())
sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
patterns = (str(sample_folder) + "/toexclude",)
exclude_pattern = {
reg_obj for reg_obj in extract_regex_objs(sample_folder, patterns)
}
source_tree = Tree(sample_folder)
event_loop.run_until_complete(
run(sample_folder, api_url, source_tree, exclude_pattern)
)
actual_result = source_tree.getTree()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment