
Commit 441545e

Merge pull request #18 from UncoderIO/space-as-and
Interpret a space as and
2 parents 5bde75e + f72ca55

File tree

4 files changed (+47, -7 lines):

siem-converter/app/converter/core/mixins/logic.py
siem-converter/app/converter/platforms/base/lucene/tokenizer.py
siem-converter/app/converter/platforms/base/spl/tokenizer.py
siem-converter/app/converter/platforms/logscale/tokenizer.py
siem-converter/app/converter/core/mixins/logic.py

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+from typing import List, Union
+
+from app.converter.core.models.field import Field, Keyword
+from app.converter.core.models.identifier import Identifier
+from app.converter.core.custom_types.tokens import LogicalOperatorType, GroupType
+
+
+class ANDLogicOperatorMixin:
+
+    @staticmethod
+    def get_missed_and_token_indices(tokens: List[Union[Field, Keyword, Identifier]]) -> List[int]:
+        missed_and_indices = []
+        for index in range(len(tokens) - 1):
+            token = tokens[index]
+            next_token = tokens[index + 1]
+            if (isinstance(token, (Field, Keyword))
+                    and not (isinstance(next_token, Identifier) and (
+                        next_token.token_type in LogicalOperatorType
+                        or next_token.token_type == GroupType.R_PAREN))):
+                missed_and_indices.append(index + 1)
+        return reversed(missed_and_indices)
+
+    def add_and_token_if_missed(self, tokens: List[Union[Field, Keyword, Identifier]]) -> List[Union[Field, Keyword, Identifier]]:
+        indices = self.get_missed_and_token_indices(tokens=tokens)
+        for index in indices:
+            tokens.insert(index, Identifier(token_type=LogicalOperatorType.AND))
+        return tokens
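
A minimal sketch of what this mixin does to a token stream. The classes below are simplified stand-ins for the real app.converter.core models (the real constructors and token-type containers differ, and Keyword is omitted for brevity); only the insertion logic mirrors the diff above:

# Stand-in models, NOT the real app.converter.core classes.
from dataclasses import dataclass

LOGICAL_OPERATORS = {"and", "or", "not"}  # stands in for LogicalOperatorType
R_PAREN = ")"                             # stands in for GroupType.R_PAREN


@dataclass
class Field:
    name: str


@dataclass
class Identifier:
    token_type: str


def add_and_token_if_missed(tokens: list) -> list:
    # Collect every position where a Field is not already followed by a
    # logical operator or a closing parenthesis -- an AND is implied there.
    missed = [i + 1 for i in range(len(tokens) - 1)
              if isinstance(tokens[i], Field)
              and not (isinstance(tokens[i + 1], Identifier)
                       and (tokens[i + 1].token_type in LOGICAL_OPERATORS
                            or tokens[i + 1].token_type == R_PAREN))]
    # Insert back to front (hence reversed() in the mixin): filling the
    # largest index first keeps the smaller pending indices valid.
    for i in reversed(missed):
        tokens.insert(i, Identifier(token_type="and"))
    return tokens


# Two space-separated conditions tokenize to two adjacent Field tokens...
tokens = [Field("status"), Field("host")]
print(add_and_token_if_missed(tokens))
# ...and come back with an explicit AND between them:
# [Field(name='status'), Identifier(token_type='and'), Field(name='host')]

Strictly, get_missed_and_token_indices returns a reversed iterator rather than the List[int] its annotation promises; the single pass over it in add_and_token_if_missed is fine with that.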

siem-converter/app/converter/platforms/base/lucene/tokenizer.py

Lines changed: 6 additions & 1 deletion

@@ -20,14 +20,15 @@
 from typing import Tuple, Union, List, Any

 from app.converter.core.exceptions.parser import TokenizerGeneralException
+from app.converter.core.mixins.logic import ANDLogicOperatorMixin
 from app.converter.core.models.field import Keyword, Field
 from app.converter.core.models.identifier import Identifier
 from app.converter.core.tokenizer import QueryTokenizer
 from app.converter.core.custom_types.tokens import OperatorType
 from app.converter.tools.utils import get_match_group


-class LuceneTokenizer(QueryTokenizer):
+class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     field_pattern = r"(?P<field_name>[a-zA-Z\.\-_]+)"
     match_operator_pattern = r"(?:___field___\s*(?P<match_operator>:))\s*"

@@ -107,3 +108,7 @@ def search_keyword(self, query: str) -> Tuple[Keyword, str]:
         keyword = Keyword(value=value)
         pos = keyword_search.end() - 1
         return keyword, query[pos:]
+
+    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+        tokens = super().tokenize(query=query)
+        return self.add_and_token_if_missed(tokens=tokens)
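
A side note on the new base-class list: with class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin), the super().tokenize(query=query) call in the override dispatches to QueryTokenizer.tokenize, since QueryTokenizer comes first in the MRO; the mixin only contributes the post-processing helper. A toy sketch with invented bodies, just to show the dispatch order:

class QueryTokenizer:
    def tokenize(self, query):
        return query.split()                    # invented body, not the real loop


class ANDLogicOperatorMixin:
    def add_and_token_if_missed(self, tokens):
        return tokens + ["<post-processed>"]    # invented body


class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
    def tokenize(self, query):
        # MRO: LuceneTokenizer -> QueryTokenizer -> ANDLogicOperatorMixin,
        # so super().tokenize() here resolves to QueryTokenizer.tokenize().
        tokens = super().tokenize(query)
        return self.add_and_token_if_missed(tokens)


print(LuceneTokenizer().tokenize("a b"))        # ['a', 'b', '<post-processed>']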

siem-converter/app/converter/platforms/base/spl/tokenizer.py

Lines changed: 9 additions & 2 deletions

@@ -17,14 +17,17 @@
 """

 import re
-from typing import Tuple, Any
+from typing import Tuple, Any, List, Union

+from app.converter.core.mixins.logic import ANDLogicOperatorMixin
+from app.converter.core.models.field import Field, Keyword
+from app.converter.core.models.identifier import Identifier
 from app.converter.core.tokenizer import QueryTokenizer
 from app.converter.core.custom_types.tokens import OperatorType
 from app.converter.tools.utils import get_match_group


-class SplTokenizer(QueryTokenizer):
+class SplTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     field_pattern = r"(?P<field_name>[a-zA-Z\.\-_\{\}]+)"
     num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
     double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,;\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'

@@ -51,3 +54,7 @@ def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.E
             return operator, s_q_value

         return super().get_operator_and_value(match)
+
+    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+        tokens = super().tokenize(query=query)
+        return self.add_and_token_if_missed(tokens=tokens)
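
SplTokenizer gains the same three-line tokenize override as LuceneTokenizer above. As an aside, and not something this commit does: the duplication could be avoided by letting the mixin override tokenize cooperatively, at the cost of flipping the base-class order. A hypothetical sketch with invented bodies:

# Hypothetical alternative, not what this commit does: the mixin overrides
# tokenize() itself and continues along the MRO, so each tokenizer subclass
# inherits the post-processing instead of repeating the override.
class QueryTokenizer:                          # stand-in base
    def tokenize(self, query):
        return query.split()                   # invented body


class ANDLogicOperatorMixin:
    def add_and_token_if_missed(self, tokens):
        return tokens                          # invented body

    def tokenize(self, query):
        # Cooperative override: super() keeps walking the MRO.
        tokens = super().tokenize(query)
        return self.add_and_token_if_missed(tokens)


# Note the flipped base order: the mixin must precede QueryTokenizer
# for its tokenize() to run first.
class SplTokenizer(ANDLogicOperatorMixin, QueryTokenizer):
    pass


print(SplTokenizer().tokenize("a b"))          # ['a', 'b']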

siem-converter/app/converter/platforms/logscale/tokenizer.py

Lines changed: 5 additions & 4 deletions

@@ -17,16 +17,17 @@
 """

 import re
-from typing import Tuple, Any
+from typing import Tuple, Any, List, Union

+from app.converter.core.mixins.logic import ANDLogicOperatorMixin
 from app.converter.core.models.field import Keyword, Field
 from app.converter.core.models.identifier import Identifier
 from app.converter.core.custom_types.tokens import GroupType, LogicalOperatorType, OperatorType
 from app.converter.core.tokenizer import QueryTokenizer
 from app.converter.tools.utils import get_match_group


-class LogScaleTokenizer(QueryTokenizer):
+class LogScaleTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     match_operator_pattern = r"""(?:___field___\s?(?P<match_operator>=|!=))\s?"""
     num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
     double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'

@@ -65,7 +66,7 @@ def __get_identifier(self, query: str) -> (list, str):
         else:
             return self.search_field_value(query)

-    def tokenize(self, query: str) -> list:
+    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
         tokenized = []
         while query:
             identifier, query = self.__get_identifier(query=query)

@@ -78,4 +79,4 @@ def tokenize(self, query: str) -> list:
                 tokenized.append(Identifier(token_type=LogicalOperatorType.AND))
             tokenized.append(identifier)
         self._validate_parentheses(tokenized)
-        return tokenized
+        return self.add_and_token_if_missed(tokens=tokenized)
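
One consequence of the guard in get_missed_and_token_indices: a Field already followed by a logical-operator Identifier is skipped, so the extra pass at the end of LogScaleTokenizer.tokenize, which already emits explicit ANDs for some adjacent identifiers while scanning, cannot double-insert. A small self-contained check with stand-in classes (Keyword omitted for brevity):

# Stand-ins only; not the real model classes.
class Field:
    def __init__(self, name):
        self.name = name


class Identifier:
    def __init__(self, token_type):
        self.token_type = token_type


LOGICAL_OPERATORS = {"and", "or", "not"}
R_PAREN = ")"


def missed_and_indices(tokens):
    # Same condition as the mixin's, simplified to these stand-in types.
    return [i + 1 for i in range(len(tokens) - 1)
            if isinstance(tokens[i], Field)
            and not (isinstance(tokens[i + 1], Identifier)
                     and (tokens[i + 1].token_type in LOGICAL_OPERATORS
                          or tokens[i + 1].token_type == R_PAREN))]


# A stream that already carries its AND is not touched again:
assert missed_and_indices([Field("a"), Identifier("and"), Field("b")]) == []
# A stream missing it gets exactly one insertion point:
assert missed_and_indices([Field("a"), Field("b")]) == [1]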
