Skip to content

Commit cc34120

Browse files
authored
Merge pull request #31 from UncoderIO/spl-keywords-improvements
spl keywords improvements, refactoring
2 parents c51dfe7 + 36170f8 commit cc34120

File tree

14 files changed

+186
-149
lines changed

14 files changed

+186
-149
lines changed

translator/.gitignore

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,3 @@ test_data
197197

198198
# backup_logs
199199
.backup_logs
200-
201-
# sigmac tests stuff
202-
tactics.json
203-
techniques.json
File renamed without changes.
File renamed without changes.

translator/app/translator/core/tokenizer.py

Lines changed: 57 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -43,35 +43,34 @@ def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
4343

4444

4545
class QueryTokenizer(BaseTokenizer):
46-
field_pattern = r"(?P<field_name>[a-zA-Z\._\-]+)"
47-
operator_pattern = r"\s?(?P<operator>and|or|not|AND|OR|NOT)\s?"
48-
field_value_pattern = r"""^___field___\s*___match_operator___\s*___value___"""
49-
match_operator_pattern = r"""(?:___field___\s?(?P<match_operator>ilike|contains|endswith|startswith|in|>=|<=|==|>|<|=~|!=|=|:|\:))\s?"""
46+
single_value_operators_map: dict[str, str] = {} # used to build the regex pattern, so the order of the keys is important
47+
multi_value_operators_map: dict[str, str] = {} # used to build the regex pattern, so the order of the keys is important
48+
operators_map: dict[str, str] = {} # used to build the regex pattern, so the order of the keys is important
49+
50+
logical_operator_pattern = r"\s?(?P<logical_operator>and|or|not|AND|OR|NOT)\s?"
51+
field_value_pattern = r"""^___field___\s*___operator___\s*___value___"""
5052
base_value_pattern = r"(?:___value_pattern___)"
51-
_value_pattern = r"""(?:\"|\')*(?P<value>[:a-zA-Z\*0-9=+%#\-_\/\\'\,.&^@!\(\s]*)(?:\*|\'|\"|\s|\$)*"""
52-
value_pattern = base_value_pattern.replace('___value_pattern___', _value_pattern)
53-
multi_value_pattern = r"""\((?P<value>[:a-zA-Z\"\*0-9=+%#\-_\/\\'\,.&^@!\(\s]*)\)"""
54-
keyword_pattern = None # do not modify, use subclasses to define this attribute
5553

56-
multi_value_operators = tuple()
54+
# do not modify; define these attributes in subclasses
55+
field_pattern: str = None
56+
_value_pattern: str = None
57+
value_pattern: str = None
58+
multi_value_pattern: str = None
59+
keyword_pattern: str = None
60+
5761
multi_value_delimiter = ","
5862
wildcard_symbol = None
5963

60-
operators_map = {
61-
"=": OperatorType.EQ,
62-
"in": OperatorType.EQ,
63-
"<": OperatorType.LT,
64-
"<=": OperatorType.LTE,
65-
">": OperatorType.GT,
66-
">=": OperatorType.GTE,
67-
"!=": OperatorType.NEQ,
68-
"contains": OperatorType.CONTAINS,
69-
"startswith": OperatorType.STARTSWITH,
70-
"endswith": OperatorType.ENDSWITH
71-
}
72-
7364
def __init_subclass__(cls, **kwargs):
    """Derive the composite regex attributes whenever a tokenizer subclass is created.

    Validates that the subclass set the required patterns, then builds
    ``value_pattern``, ``operators_map`` and ``operator_pattern`` from the
    subclass's own class attributes.

    Raises:
        ValueError: propagated from ``_validate_re_patterns`` when a required
            pattern is missing.
    """
    # Cooperate with any other class in the MRO (mixins, ABCs) that also hooks subclass creation.
    super().__init_subclass__(**kwargs)
    cls._validate_re_patterns()
    cls.value_pattern = cls.base_value_pattern.replace('___value_pattern___', cls._value_pattern)
    cls.operators_map = {**cls.single_value_operators_map, **cls.multi_value_operators_map}
    # The alternation tries keys in dict order, so maps must list e.g. "<=" before "<".
    operators_group = '|'.join(cls.operators_map)
    cls.operator_pattern = fr"""(?:___field___\s*(?P<operator>(?:{operators_group})))\s*"""
69+
70+
@classmethod
71+
def _validate_re_patterns(cls):
72+
if not all([cls.field_pattern, cls._value_pattern]):
73+
raise ValueError(f"{cls.__name__} re patterns must be set")
7574

7675
def map_operator(self, operator: str) -> str:
7776
try:
@@ -89,16 +88,16 @@ def search_field(self, query):
8988
def escape_field_name(self, field_name):
    """Return *field_name* with every dot backslash-escaped for use inside a regex."""
    # Splitting on "." and re-joining with the escaped form touches the dots only.
    return r"\.".join(field_name.split("."))
9190

92-
def search_match_operator(self, query, field_name) -> str:
91+
def search_operator(self, query, field_name) -> str:
9392
field_name = self.escape_field_name(field_name)
94-
match_operator_pattern = self.match_operator_pattern.replace("___field___", field_name)
95-
match_operator_regex = re.compile(match_operator_pattern, re.IGNORECASE)
96-
match_operator_search = re.search(match_operator_regex, query)
97-
if match_operator_search is None:
93+
operator_pattern = self.operator_pattern.replace("___field___", field_name)
94+
compiled_operator_regex = re.compile(operator_pattern, re.IGNORECASE)
95+
if (operator_search := re.search(compiled_operator_regex, query)) is None:
9896
raise TokenizerGeneralException(error=f"Operator couldn't be found in query part: {query}")
99-
match_operator = match_operator_search.group("match_operator")
100-
match_operator = match_operator.strip(" ")
101-
return match_operator
97+
98+
operator = operator_search.group("operator")
99+
operator = operator.strip(" ")
100+
return operator
102101

103102
def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.EQ) -> Tuple[str, Any]:
    """Return the given operator together with the ``value`` group read from *match*."""
    value = get_match_group(match, group_name='value')
    return operator, value
@@ -118,7 +117,7 @@ def search_value(self, query: str, operator: str, field_name: str) -> Tuple[str,
118117
field_value_pattern = self.get_field_value_pattern(operator, field_name)
119118
value_pattern = self.value_pattern
120119
is_multi = False
121-
if operator.lower() in self.multi_value_operators:
120+
if operator.lower() in self.multi_value_operators_map:
122121
value_pattern = self.multi_value_pattern
123122
is_multi = True
124123

@@ -142,7 +141,7 @@ def search_keyword(self, query: str) -> Tuple[Keyword, str]:
142141

143142
def get_field_value_pattern(self, operator, field_name):
144143
field_value_pattern = self.field_value_pattern.replace("___field___", self.escape_field_name(field_name))
145-
return field_value_pattern.replace("___match_operator___", operator)
144+
return field_value_pattern.replace("___operator___", operator)
146145

147146
@staticmethod
148147
def _clean_value(value: str, wildcard_symbol: str) -> str:
@@ -183,28 +182,45 @@ def create_field(field_name: str, operator: Identifier, value: Union[str, List])
183182

184183
def search_field_value(self, query):
185184
field_name = self.search_field(query)
186-
operator = self.search_match_operator(query, field_name)
185+
operator = self.search_operator(query, field_name)
187186
query, operator, value = self.search_value(query=query, operator=operator, field_name=field_name)
188187
value, operator_token = self.process_value_wildcard_symbols(value=value,
189188
operator=operator,
190189
wildcard_symbol=self.wildcard_symbol)
191190
field = self.create_field(field_name=field_name, operator=operator_token, value=value)
192191
return field, query
193192

194-
def __get_identifier(self, query: str) -> Tuple[Union[Field, Keyword, Identifier], str]:
193+
def _match_field_value(self, query: str, white_space_pattern: str = r"\s+") -> bool:
194+
single_value_operator_group = fr"(?:{'|'.join(self.single_value_operators_map)})"
195+
single_value_pattern = fr"""{self.field_pattern}\s*{single_value_operator_group}\s*{self.value_pattern}\s*"""
196+
if re.match(single_value_pattern, query, re.IGNORECASE):
197+
return True
198+
199+
if self.multi_value_operators_map:
200+
multi_value_operator_group = fr"(?:{'|'.join(self.multi_value_operators_map)})"
201+
pattern = f"{self.field_pattern}{white_space_pattern}{multi_value_operator_group}{white_space_pattern}"
202+
multi_value_pattern = fr"{pattern}{self.multi_value_pattern}"
203+
if re.match(multi_value_pattern, query, re.IGNORECASE):
204+
return True
205+
206+
return False
207+
208+
def _get_identifier(self, query: str) -> Tuple[Union[Field, Keyword, Identifier], str]:
195209
query = query.strip("\n").strip(" ").strip("\n")
196210
if query.startswith(GroupType.L_PAREN):
197211
return Identifier(token_type=GroupType.L_PAREN), query[1:]
198212
elif query.startswith(GroupType.R_PAREN):
199213
return Identifier(token_type=GroupType.R_PAREN), query[1:]
200-
elif operator_search := re.match(self.operator_pattern, query):
201-
operator = operator_search.group("operator")
202-
pos = operator_search.end()
203-
return Identifier(token_type=operator.lower()), query[pos:]
214+
elif logical_operator_search := re.match(self.logical_operator_pattern, query):
215+
logical_operator = logical_operator_search.group("logical_operator")
216+
pos = logical_operator_search.end()
217+
return Identifier(token_type=logical_operator.lower()), query[pos:]
218+
elif self._match_field_value(query):
219+
return self.search_field_value(query)
204220
elif self.keyword_pattern and re.match(self.keyword_pattern, query):
205221
return self.search_keyword(query)
206-
else:
207-
return self.search_field_value(query)
222+
223+
raise TokenizerGeneralException("Unsupported query entry")
208224

209225
@staticmethod
210226
def _validate_parentheses(tokens):
@@ -224,7 +240,7 @@ def _validate_parentheses(tokens):
224240
def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
225241
tokenized = []
226242
while query:
227-
identifier, query = self.__get_identifier(query=query)
243+
identifier, query = self._get_identifier(query=query)
228244
tokenized.append(identifier)
229245
self._validate_parentheses(tokenized)
230246
return tokenized

translator/app/translator/platforms/athena/tokenizer.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,28 @@
2626

2727

2828
class AthenaTokenizer(QueryTokenizer):
29+
single_value_operators_map = {
30+
"=": OperatorType.EQ,
31+
"<=": OperatorType.LTE,
32+
"<": OperatorType.LT,
33+
">=": OperatorType.GTE,
34+
">": OperatorType.GT,
35+
"!=": OperatorType.NEQ,
36+
"<>": OperatorType.NEQ,
37+
"like": OperatorType.EQ
38+
}
39+
multi_value_operators_map = {
40+
"in": OperatorType.EQ
41+
}
42+
2943
field_pattern = r'(?P<field_name>"[a-zA-Z\._\-\s]+"|[a-zA-Z\._\-]+)'
30-
match_operator_pattern = r"""(?:___field___\s?(?P<match_operator>like|in|<=|>=|==|>|<|<>|!=|=))\s?"""
3144
num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
3245
bool_value_pattern = r"(?P<bool_value>true|false)\s*"
3346
single_quotes_value_pattern = r"""'(?P<s_q_value>(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{\}\s]|'')*)'"""
3447
_value_pattern = fr"{num_value_pattern}|{bool_value_pattern}|{single_quotes_value_pattern}"
3548
multi_value_pattern = r"""\((?P<value>\d+(?:,\s*\d+)*|'(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{\}\s]|'')*'(?:,\s*'(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{\}\s]|'')*')*)\)"""
3649

37-
multi_value_operators = ("in",)
3850
wildcard_symbol = "%"
39-
operators_map = {
40-
"like": OperatorType.EQ
41-
}
42-
43-
def __init__(self):
44-
super().__init__()
45-
self.operators_map.update(super().operators_map)
4651

4752
@staticmethod
4853
def should_process_value_wildcard_symbols(operator: str) -> bool:
@@ -62,7 +67,7 @@ def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.E
6267

6368
def search_field_value(self, query):
6469
field_name = self.search_field(query)
65-
operator = self.search_match_operator(query, field_name)
70+
operator = self.search_operator(query, field_name)
6671
should_process_value_wildcard_symbols = self.should_process_value_wildcard_symbols(operator)
6772
query, operator, value = self.search_value(query=query, operator=operator, field_name=field_name)
6873

translator/app/translator/platforms/base/lucene/const.py

Lines changed: 0 additions & 11 deletions
This file was deleted.

translator/app/translator/platforms/base/lucene/tokenizer.py

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,34 +26,36 @@
2626
from app.translator.core.tokenizer import QueryTokenizer
2727
from app.translator.core.custom_types.tokens import OperatorType
2828
from app.translator.tools.utils import get_match_group
29-
from app.translator.platforms.base.lucene.const import COMPARISON_OPERATORS_MAP
3029

3130

3231
class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
32+
single_value_operators_map = {
33+
":>": OperatorType.GT,
34+
":<": OperatorType.LT,
35+
":": OperatorType.EQ
36+
}
37+
multi_value_operators_map = {
38+
":": OperatorType.EQ
39+
}
40+
3341
field_pattern = r"(?P<field_name>[a-zA-Z\.\-_]+)"
3442
match_operator_pattern = r"(?:___field___\s*(?P<match_operator>:\[\*\sTO|:\[|:<|:>|:))\s*"
35-
num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
43+
_num_value_pattern = r"\d+(?:\.\d+)*"
44+
num_value_pattern = fr"(?P<num_value>{_num_value_pattern})\s*"
3645
double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'
3746
no_quotes_value_pattern = r"(?P<n_q_value>(?:[a-zA-Z\*0-9=%#_/,\'\.$@]|\\\"|\\\\)+)\s*"
3847
re_value_pattern = r"/(?P<re_value>[:a-zA-Z\*0-9=+%#\\\-_\,\"\'\.$&^@!\(\)\{\}\[\]\s?]+)/\s*"
39-
_value_pattern = fr"{num_value_pattern}|{re_value_pattern}|{no_quotes_value_pattern}|{double_quotes_value_pattern}"
48+
gte_value_pattern = fr"\[\s*(?P<gte_value>{_num_value_pattern})\s+TO\s+\*\s*\]"
49+
lte_value_pattern = fr"\[\s*\*\s+TO\s+(?P<lte_value>{_num_value_pattern})\s*\]"
50+
range_value_pattern = fr"{gte_value_pattern}|{lte_value_pattern}"
51+
_value_pattern = fr"{num_value_pattern}|{re_value_pattern}|{no_quotes_value_pattern}|{double_quotes_value_pattern}|{range_value_pattern}"
4052
keyword_pattern = r"(?P<n_q_value>(?:[a-zA-Z\*0-9=%#_/,\'\.$@]|\\\"|\\\(|\\\)|\\\[|\\\]|\\\{|\\\}|\\\:|\\)+)(?:\s+|\)|$)"
4153

4254
multi_value_pattern = r"""\((?P<value>[:a-zA-Z\"\*0-9=+%#\-_\/\\'\,.&^@!\(\[\]\s]+)\)"""
4355
multi_value_check_pattern = r"___field___\s*___operator___\s*\("
4456

4557
wildcard_symbol = "*"
4658

47-
operators_map = {
48-
":": OperatorType.EQ,
49-
":>": OperatorType.GT,
50-
":<": OperatorType.LT
51-
}
52-
53-
def __init__(self):
54-
super().__init__()
55-
self.operators_map.update(super().operators_map)
56-
5759
@staticmethod
5860
def create_field(field_name: str, operator: Identifier, value: Union[str, List]) -> Field:
5961
field_name = field_name.replace(".text", "")
@@ -79,11 +81,15 @@ def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.E
7981
elif (d_q_value := get_match_group(match, group_name='d_q_value')) is not None:
8082
return operator, d_q_value
8183

84+
elif (gte_value := get_match_group(match, group_name='gte_value')) is not None:
85+
return OperatorType.GTE, gte_value
86+
87+
elif (lte_value := get_match_group(match, group_name='lte_value')) is not None:
88+
return OperatorType.LTE, lte_value
89+
8290
return super().get_operator_and_value(match, operator)
8391

8492
def search_value(self, query: str, operator: str, field_name: str) -> Tuple[str, str, Union[str, List[str]]]:
85-
if operator in COMPARISON_OPERATORS_MAP.keys():
86-
return self.search_value_gte_lte(query, operator, field_name)
8793
check_pattern = self.multi_value_check_pattern
8894
check_regex = check_pattern.replace('___field___', field_name).replace('___operator___', operator)
8995
if re.match(check_regex, query):
@@ -105,14 +111,6 @@ def search_value(self, query: str, operator: str, field_name: str) -> Tuple[str,
105111
pos = field_value_search.end()
106112
return query[pos:], operator, value
107113

108-
def search_value_gte_lte(self, query: str, operator: str, field_name: str) -> Tuple[str, str, Union[str, List[str]]]:
109-
query_list = query.split("]")
110-
to_replace = [v for val in COMPARISON_OPERATORS_MAP.values() for v in val["replace"]]
111-
to_replace.append(field_name)
112-
regex = re.compile('|'.join(to_replace))
113-
value = re.sub(regex, '', query_list.pop(0))
114-
return "".join(query_list), COMPARISON_OPERATORS_MAP.get(operator, {}).get("default_op"), value.strip()
115-
116114
def search_keyword(self, query: str) -> Tuple[Keyword, str]:
117115
keyword_search = re.search(self.keyword_pattern, query)
118116
_, value = self.get_operator_and_value(keyword_search)
@@ -121,6 +119,14 @@ def search_keyword(self, query: str) -> Tuple[Keyword, str]:
121119
pos = keyword_search.end() - 1
122120
return keyword, query[pos:]
123121

122+
def _match_field_value(self, query: str, white_space_pattern: str = r"\s*") -> bool:
123+
range_value_pattern = f"(?:{self.gte_value_pattern}|{self.lte_value_pattern})"
124+
range_pattern = fr"{self.field_pattern}{white_space_pattern}:\s*{range_value_pattern}"
125+
if re.match(range_pattern, query, re.IGNORECASE):
126+
return True
127+
128+
return super()._match_field_value(query, white_space_pattern=white_space_pattern)
129+
124130
def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
125131
tokens = super().tokenize(query=query)
126132
return self.add_and_token_if_missed(tokens=tokens)

translator/app/translator/platforms/base/spl/tokenizer.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,25 @@
2828

2929

3030
class SplTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
31+
single_value_operators_map = {
32+
"=": OperatorType.EQ,
33+
"<=": OperatorType.LTE,
34+
"<": OperatorType.LT,
35+
">=": OperatorType.GTE,
36+
">": OperatorType.GT,
37+
"!=": OperatorType.NEQ
38+
}
39+
multi_value_operators_map = {"in": OperatorType.EQ}
40+
3141
field_pattern = r"(?P<field_name>[a-zA-Z\.\-_\{\}]+)"
32-
num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
33-
double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,;\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'
42+
num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)(?=$|\s|\))"
43+
double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,;\'\.$&^@!\]\[\(\)\{\}\s]|\\\"|\\)*)"\s*'
3444
single_quotes_value_pattern = r"'(?P<s_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,;\"\.$&^@!\(\)\{\}\s]|\\\'|\\)*)'\s*"
35-
no_quotes_value = r"(?P<no_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\.\\$&^@!])+)\s*"
36-
_value_pattern = fr"{num_value_pattern}|{no_quotes_value}|{double_quotes_value_pattern}|{single_quotes_value_pattern}"
45+
no_quotes_value_pattern = r"(?P<no_q_value>(?:[:a-zA-Z\*0-9+%#\-_/,\.$&^@!]|\\\s|\\=|\\!=|\\<|\\<=|\\>|\\>=|\\\\)+)(?=$|\s|\))"
46+
_value_pattern = fr"{num_value_pattern}|{no_quotes_value_pattern}|{double_quotes_value_pattern}|{single_quotes_value_pattern}"
3747
multi_value_pattern = r"""\((?P<value>[:a-zA-Z\"\*0-9=+%#\-_\/\\'\,;.$&^@!\{\}\(\s]+)\)"""
38-
keyword_pattern = double_quotes_value_pattern
48+
keyword_pattern = fr"{double_quotes_value_pattern}|{no_quotes_value_pattern}"
3949

40-
multi_value_operators = ("in",)
4150
wildcard_symbol = "*"
4251

4352
def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.EQ) -> Tuple[str, Any]:

translator/app/translator/platforms/chronicle/tokenizer.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,16 @@
2626

2727

2828
class ChronicleQueryTokenizer(QueryTokenizer):
29-
field_pattern = r"(?P<field_name>[a-zA-Z0-9\._]+)"
29+
single_value_operators_map = {
30+
"=": OperatorType.EQ,
31+
"<=": OperatorType.LTE,
32+
"<": OperatorType.LT,
33+
">=": OperatorType.GTE,
34+
">": OperatorType.GT,
35+
"!=": OperatorType.NEQ
36+
}
3037

38+
field_pattern = r"(?P<field_name>[a-zA-Z0-9\._]+)"
3139
num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
3240
bool_value_pattern = r"(?P<bool_value>true|false)\s*"
3341
double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{\}\s]|\\\"|\\\\)*)"\s*(?:nocase)?'

0 commit comments

Comments
 (0)