
Commit 441545e

Merge pull request #18 from UncoderIO/space-as-and
Interpret a space as and
2 parents 5bde75e + f72ca55

File tree

4 files changed (+47, -7 lines):

siem-converter/app/converter/core/mixins/logic.py
siem-converter/app/converter/platforms/base/lucene/tokenizer.py
siem-converter/app/converter/platforms/base/spl/tokenizer.py
siem-converter/app/converter/platforms/logscale/tokenizer.py
siem-converter/app/converter/core/mixins/logic.py

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+from typing import List, Union
+
+from app.converter.core.models.field import Field, Keyword
+from app.converter.core.models.identifier import Identifier
+from app.converter.core.custom_types.tokens import LogicalOperatorType, GroupType
+
+
+class ANDLogicOperatorMixin:
+
+    @staticmethod
+    def get_missed_and_token_indices(tokens: List[Union[Field, Keyword, Identifier]]) -> List[int]:
+        missed_and_indices = []
+        for index in range(len(tokens) - 1):
+            token = tokens[index]
+            next_token = tokens[index + 1]
+            if (isinstance(token, (Field, Keyword))
+                    and not (isinstance(next_token, Identifier) and (
+                        next_token.token_type in LogicalOperatorType
+                        or next_token.token_type == GroupType.R_PAREN))):
+                missed_and_indices.append(index + 1)
+        return reversed(missed_and_indices)
+
+    def add_and_token_if_missed(self, tokens: List[Union[Field, Keyword, Identifier]]) -> List[Union[Field, Keyword, Identifier]]:
+        indices = self.get_missed_and_token_indices(tokens=tokens)
+        for index in indices:
+            tokens.insert(index, Identifier(token_type=LogicalOperatorType.AND))
+        return tokens
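
A minimal sketch of what this mixin does to a token stream. The classes below are simplified stand-ins for the real app.converter.core models (the real constructors and token-type containers differ, and Keyword is omitted for brevity); only the insertion logic mirrors the diff above:

# Stand-in models, NOT the real app.converter.core classes.
from dataclasses import dataclass

LOGICAL_OPERATORS = {"and", "or", "not"}  # stands in for LogicalOperatorType
R_PAREN = ")"                             # stands in for GroupType.R_PAREN


@dataclass
class Field:
    name: str


@dataclass
class Identifier:
    token_type: str


def add_and_token_if_missed(tokens: list) -> list:
    # Collect every position where a Field is not already followed by a
    # logical operator or a closing parenthesis -- an AND is implied there.
    missed = [i + 1 for i in range(len(tokens) - 1)
              if isinstance(tokens[i], Field)
              and not (isinstance(tokens[i + 1], Identifier)
                       and (tokens[i + 1].token_type in LOGICAL_OPERATORS
                            or tokens[i + 1].token_type == R_PAREN))]
    # Insert back to front (hence reversed() in the mixin): filling the
    # largest index first keeps the smaller pending indices valid.
    for i in reversed(missed):
        tokens.insert(i, Identifier(token_type="and"))
    return tokens


# Two space-separated conditions tokenize to two adjacent Field tokens...
tokens = [Field("status"), Field("host")]
print(add_and_token_if_missed(tokens))
# ...and come back with an explicit AND between them:
# [Field(name='status'), Identifier(token_type='and'), Field(name='host')]

Strictly, get_missed_and_token_indices returns a reversed iterator rather than the List[int] its annotation promises; the single pass over it in add_and_token_if_missed is fine with that.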

siem-converter/app/converter/platforms/base/lucene/tokenizer.py

Lines changed: 6 additions & 1 deletion

@@ -20,14 +20,15 @@
 from typing import Tuple, Union, List, Any

 from app.converter.core.exceptions.parser import TokenizerGeneralException
+from app.converter.core.mixins.logic import ANDLogicOperatorMixin
 from app.converter.core.models.field import Keyword, Field
 from app.converter.core.models.identifier import Identifier
 from app.converter.core.tokenizer import QueryTokenizer
 from app.converter.core.custom_types.tokens import OperatorType
 from app.converter.tools.utils import get_match_group


-class LuceneTokenizer(QueryTokenizer):
+class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     field_pattern = r"(?P<field_name>[a-zA-Z\.\-_]+)"
     match_operator_pattern = r"(?:___field___\s*(?P<match_operator>:))\s*"

@@ -107,3 +108,7 @@ def search_keyword(self, query: str) -> Tuple[Keyword, str]:
         keyword = Keyword(value=value)
         pos = keyword_search.end() - 1
         return keyword, query[pos:]
+
+    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+        tokens = super().tokenize(query=query)
+        return self.add_and_token_if_missed(tokens=tokens)
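
A side note on the new base-class list: with class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin), the super().tokenize(query=query) call in the override dispatches to QueryTokenizer.tokenize, since QueryTokenizer comes first in the MRO; the mixin only contributes the post-processing helper. A toy sketch with invented bodies, just to show the dispatch order:

class QueryTokenizer:
    def tokenize(self, query):
        return query.split()                    # invented body, not the real loop


class ANDLogicOperatorMixin:
    def add_and_token_if_missed(self, tokens):
        return tokens + ["<post-processed>"]    # invented body


class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
    def tokenize(self, query):
        # MRO: LuceneTokenizer -> QueryTokenizer -> ANDLogicOperatorMixin,
        # so super().tokenize() here resolves to QueryTokenizer.tokenize().
        tokens = super().tokenize(query)
        return self.add_and_token_if_missed(tokens)


print(LuceneTokenizer().tokenize("a b"))        # ['a', 'b', '<post-processed>']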

siem-converter/app/converter/platforms/base/spl/tokenizer.py

Lines changed: 9 additions & 2 deletions

@@ -17,14 +17,17 @@
 """

 import re
-from typing import Tuple, Any
+from typing import Tuple, Any, List, Union

+from app.converter.core.mixins.logic import ANDLogicOperatorMixin
+from app.converter.core.models.field import Field, Keyword
+from app.converter.core.models.identifier import Identifier
 from app.converter.core.tokenizer import QueryTokenizer
 from app.converter.core.custom_types.tokens import OperatorType
 from app.converter.tools.utils import get_match_group


-class SplTokenizer(QueryTokenizer):
+class SplTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     field_pattern = r"(?P<field_name>[a-zA-Z\.\-_\{\}]+)"
     num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
     double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,;\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'

@@ -51,3 +54,7 @@ def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.E
             return operator, s_q_value

         return super().get_operator_and_value(match)
+
+    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+        tokens = super().tokenize(query=query)
+        return self.add_and_token_if_missed(tokens=tokens)
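
SplTokenizer gains the same three-line tokenize override as LuceneTokenizer above. As an aside, and not something this commit does: the duplication could be avoided by letting the mixin override tokenize cooperatively, at the cost of flipping the base-class order. A hypothetical sketch with invented bodies:

# Hypothetical alternative, not what this commit does: the mixin overrides
# tokenize() itself and continues along the MRO, so each tokenizer subclass
# inherits the post-processing instead of repeating the override.
class QueryTokenizer:                          # stand-in base
    def tokenize(self, query):
        return query.split()                   # invented body


class ANDLogicOperatorMixin:
    def add_and_token_if_missed(self, tokens):
        return tokens                          # invented body

    def tokenize(self, query):
        # Cooperative override: super() keeps walking the MRO.
        tokens = super().tokenize(query)
        return self.add_and_token_if_missed(tokens)


# Note the flipped base order: the mixin must precede QueryTokenizer
# for its tokenize() to run first.
class SplTokenizer(ANDLogicOperatorMixin, QueryTokenizer):
    pass


print(SplTokenizer().tokenize("a b"))          # ['a', 'b']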

siem-converter/app/converter/platforms/logscale/tokenizer.py

Lines changed: 5 additions & 4 deletions

@@ -17,16 +17,17 @@
 """

 import re
-from typing import Tuple, Any
+from typing import Tuple, Any, List, Union

+from app.converter.core.mixins.logic import ANDLogicOperatorMixin
 from app.converter.core.models.field import Keyword, Field
 from app.converter.core.models.identifier import Identifier
 from app.converter.core.custom_types.tokens import GroupType, LogicalOperatorType, OperatorType
 from app.converter.core.tokenizer import QueryTokenizer
 from app.converter.tools.utils import get_match_group


-class LogScaleTokenizer(QueryTokenizer):
+class LogScaleTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     match_operator_pattern = r"""(?:___field___\s?(?P<match_operator>=|!=))\s?"""
     num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
     double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'

@@ -65,7 +66,7 @@ def __get_identifier(self, query: str) -> (list, str):
         else:
             return self.search_field_value(query)

-    def tokenize(self, query: str) -> list:
+    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
         tokenized = []
         while query:
             identifier, query = self.__get_identifier(query=query)

@@ -78,4 +79,4 @@ def tokenize(self, query: str) -> list:
                 tokenized.append(Identifier(token_type=LogicalOperatorType.AND))
             tokenized.append(identifier)
         self._validate_parentheses(tokenized)
-        return tokenized
+        return self.add_and_token_if_missed(tokens=tokenized)
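
One consequence of the guard in get_missed_and_token_indices: a Field already followed by a logical-operator Identifier is skipped, so the extra pass at the end of LogScaleTokenizer.tokenize, which already emits explicit ANDs for some adjacent identifiers while scanning, cannot double-insert. A small self-contained check with stand-in classes (Keyword omitted for brevity):

# Stand-ins only; not the real model classes.
class Field:
    def __init__(self, name):
        self.name = name


class Identifier:
    def __init__(self, token_type):
        self.token_type = token_type


LOGICAL_OPERATORS = {"and", "or", "not"}
R_PAREN = ")"


def missed_and_indices(tokens):
    # Same condition as the mixin's, simplified to these stand-in types.
    return [i + 1 for i in range(len(tokens) - 1)
            if isinstance(tokens[i], Field)
            and not (isinstance(tokens[i + 1], Identifier)
                     and (tokens[i + 1].token_type in LOGICAL_OPERATORS
                          or tokens[i + 1].token_type == R_PAREN))]


# A stream that already carries its AND is not touched again:
assert missed_and_indices([Field("a"), Identifier("and"), Field("b")]) == []
# A stream missing it gets exactly one insertion point:
assert missed_and_indices([Field("a"), Field("b")]) == [1]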
