Skip to content

Commit cf9753e

Browse files
authored
Merge pull request #36 from UncoderIO/escaping_logic
Added escaping logic support
2 parents c0a4420 + 5b57082 commit cf9753e

File tree

27 files changed

+338
-161
lines changed

27 files changed

+338
-161
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from app.translator.tools.custom_enum import CustomEnum
2+
3+
4+
class ValueType(CustomEnum):
    """Identifiers for the kinds of values handled by tokenizers and escape managers.

    The string values double as regex named-group names in the tokenizer
    patterns and as keys of ``EscapeManager.escape_map``.
    """

    # Generic value; the default type used when nothing more specific applies.
    value = "value"

    # Plain numeric and boolean literals.
    number_value = "num_value"

    # Values distinguished by the quoting style they were written with.
    double_quotes_value = "d_q_value"
    single_quotes_value = "s_q_value"
    back_quotes_value = "b_q_value"
    no_quotes_value = "no_q_value"

    bool_value = "bool_value"

    # Regular-expression literal.
    regular_expression_value = "re_value"

    # Bounds captured from range expressions.
    greater_than_or_equal = "gte_value"
    less_than_or_equal = "lte_value"
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import re
2+
from abc import ABC
3+
from typing import Union
4+
5+
from app.translator.core.custom_types.values import ValueType
6+
from app.translator.core.models.escape_details import EscapeDetails
7+
8+
9+
class EscapeManager(ABC):
    """Base class for escaping and un-escaping query values.

    Subclasses populate ``escape_map`` with one ``EscapeDetails`` entry per
    value type they want escaped; value types without an entry are returned
    unchanged by :meth:`escape`.
    """

    # Maps a value-type key to the pattern/replacement pair used by escape().
    escape_map: dict[str, EscapeDetails] = {}

    # A single backslash escape sequence consisting only of ASCII characters:
    # octal, \xHH, \uHHHH, \UHHHHHHHH, \N{name}, or a backslash followed by
    # any one ASCII character. Non-ASCII text is never matched, so it is never
    # passed through the byte-oriented "unicode_escape" codec.
    _escape_sequence_pattern = re.compile(
        r"\\(?:[0-7]{1,3}|x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8}|N\{[^{}]+\}|[\x00-\x7f])"
    )

    def escape(self, value: Union[str, int], value_type: str = ValueType.value) -> Union[str, int]:
        """Escape ``value`` according to the rule registered for ``value_type``.

        Args:
            value: The value to escape. Integers are returned untouched.
            value_type: Key into ``escape_map`` selecting the escaping rule.

        Returns:
            The escaped string, or ``value`` unchanged when it is an ``int``
            or when no rule is registered for ``value_type``.
        """
        if isinstance(value, int):
            return value
        if escape_details := self.escape_map.get(value_type):
            value = re.sub(escape_details.pattern, escape_details.escape_symbols, value)
        return value

    def remove_escape(self, value: Union[str, int]) -> Union[str, int]:
        """Resolve backslash escape sequences in ``value``.

        Each ASCII escape sequence is decoded with the ``unicode_escape``
        codec individually, so the result for pure-ASCII input is the same as
        decoding the whole string at once. Decoding the whole string would
        first UTF-8-encode it and then read the bytes back as Latin-1, which
        corrupts every non-ASCII character; decoding sequence by sequence
        leaves such characters intact. A lone backslash at the very end of
        the string is not an escape sequence and is kept as-is.

        Args:
            value: The value to un-escape. Integers are returned untouched.

        Returns:
            The string with escape sequences resolved, or ``value`` unchanged
            when it is an ``int``.

        Raises:
            UnicodeDecodeError: If the string contains a malformed sequence
                such as a truncated ``\\x`` or ``\\u`` escape.
        """
        if isinstance(value, int):
            return value
        return self._escape_sequence_pattern.sub(self._decode_escape_sequence, value)

    @staticmethod
    def _decode_escape_sequence(match: re.Match) -> str:
        """Decode one matched ASCII escape sequence via ``unicode_escape``."""
        return match.group(0).encode("ascii").decode("unicode_escape")
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from dataclasses import dataclass
2+
3+
4+
@dataclass
class EscapeDetails:
    """Describes one escaping rule: what to match and what to replace it with.

    Attributes:
        pattern: Regular expression whose matches must be escaped. Defaults to
            ``None`` when not supplied.
        escape_symbols: ``re.sub`` replacement template applied to each match.
            The default emits a literal backslash followed by capture group 1.
    """

    pattern: str = None
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequence "\g", which triggers a warning on modern Python versions. The
    # runtime value (three backslashes followed by "g<1>") is unchanged.
    escape_symbols: str = r"\\\g<1>"

translator/app/translator/core/render.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
from typing import Union, List, Dict
2222

2323
from app.translator.const import DEFAULT_VALUE_TYPE
24+
from app.translator.core.custom_types.values import ValueType
25+
from app.translator.core.escape_manager import EscapeManager
2426
from app.translator.core.exceptions.core import NotImplementedException, StrictPlatformException
2527
from app.translator.core.exceptions.parser import UnsupportedOperatorException
2628
from app.translator.core.functions import PlatformFunctions
@@ -34,6 +36,7 @@
3436

3537
class BaseQueryFieldValue(ABC):
3638
details: PlatformDetails = None
39+
escape_manager: EscapeManager = None
3740

3841
def __init__(self, or_token):
3942
self.field_value = {
@@ -84,6 +87,10 @@ def regex_modifier(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
8487
def keywords(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
8588
raise NotImplementedException
8689

90+
def apply_value(self, value: Union[str, int], value_type: str = ValueType.value) -> Union[str, int]:
91+
updated_value = self.escape_manager.escape(value, value_type)
92+
return updated_value
93+
8794
def apply_field_value(self, field, operator, value):
8895
if modifier_function := self.field_value.get(operator.token_type):
8996
return modifier_function(field, value)

translator/app/translator/core/tokenizer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import re
2121
from typing import Tuple, Union, List, Any, Optional, Type
2222

23+
from app.translator.core.custom_types.values import ValueType
24+
from app.translator.core.escape_manager import EscapeManager
2325
from app.translator.core.exceptions.parser import (
2426
UnsupportedOperatorException,
2527
TokenizerGeneralException,
@@ -60,6 +62,7 @@ class QueryTokenizer(BaseTokenizer):
6062

6163
multi_value_delimiter = ","
6264
wildcard_symbol = None
65+
escape_manager: EscapeManager = None
6366

6467
def __init_subclass__(cls, **kwargs):
6568
cls._validate_re_patterns()
@@ -100,7 +103,7 @@ def search_operator(self, query, field_name) -> str:
100103
return operator
101104

102105
def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.EQ) -> Tuple[str, Any]:
103-
return operator, get_match_group(match, group_name='value')
106+
return operator, get_match_group(match, group_name=ValueType.value)
104107

105108
@staticmethod
106109
def clean_multi_value(value: Union[int, str]) -> Union[int, str]:

translator/app/translator/platforms/athena/tokenizer.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import re
2020
from typing import Tuple, Any
2121

22+
from app.translator.core.custom_types.values import ValueType
2223
from app.translator.core.models.identifier import Identifier
2324
from app.translator.core.tokenizer import QueryTokenizer
2425
from app.translator.core.custom_types.tokens import OperatorType
@@ -41,11 +42,11 @@ class AthenaTokenizer(QueryTokenizer):
4142
}
4243

4344
field_pattern = r'(?P<field_name>"[a-zA-Z\._\-\s]+"|[a-zA-Z\._\-]+)'
44-
num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
45-
bool_value_pattern = r"(?P<bool_value>true|false)\s*"
46-
single_quotes_value_pattern = r"""'(?P<s_q_value>(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{\}\s]|'')*)'"""
45+
num_value_pattern = fr"(?P<{ValueType.number_value}>\d+(?:\.\d+)*)\s*"
46+
bool_value_pattern = fr"(?P<{ValueType.bool_value}>true|false)\s*"
47+
single_quotes_value_pattern = fr"""'(?P<{ValueType.single_quotes_value}>(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{{\}}\s]|'')*)'"""
4748
_value_pattern = fr"{num_value_pattern}|{bool_value_pattern}|{single_quotes_value_pattern}"
48-
multi_value_pattern = r"""\((?P<value>\d+(?:,\s*\d+)*|'(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{\}\s]|'')*'(?:,\s*'(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{\}\s]|'')*')*)\)"""
49+
multi_value_pattern = fr"""\((?P<{ValueType.value}>\d+(?:,\s*\d+)*|'(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{{\}}\s]|'')*'(?:,\s*'(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{{\}}\s]|'')*')*)\)"""
4950

5051
wildcard_symbol = "%"
5152

@@ -54,13 +55,13 @@ def should_process_value_wildcard_symbols(operator: str) -> bool:
5455
return operator.lower() in ("like",)
5556

5657
def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.EQ) -> Tuple[str, Any]:
57-
if (num_value := get_match_group(match, group_name='num_value')) is not None:
58+
if (num_value := get_match_group(match, group_name=ValueType.number_value)) is not None:
5859
return operator, num_value
5960

60-
elif (bool_value := get_match_group(match, group_name='bool_value')) is not None:
61+
elif (bool_value := get_match_group(match, group_name=ValueType.bool_value)) is not None:
6162
return operator, bool_value
6263

63-
elif (s_q_value := get_match_group(match, group_name='s_q_value')) is not None:
64+
elif (s_q_value := get_match_group(match, group_name=ValueType.single_quotes_value)) is not None:
6465
return operator, s_q_value
6566

6667
return super().get_operator_and_value(match, operator)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from app.translator.core.custom_types.values import ValueType
2+
from app.translator.core.escape_manager import EscapeManager
3+
from app.translator.core.models.escape_details import EscapeDetails
4+
5+
6+
class LuceneEscapeManager(EscapeManager):
    """Escape manager holding the escaping rule for Lucene query values."""

    escape_map = {
        # The pattern captures a single character from the listed set
        # (punctuation, whitespace, backslash); the replacement puts a
        # backslash in front of the captured character.
        ValueType.value: EscapeDetails(
            pattern=r'([_!@#$%^&*=+()\[\]{}|;:\'",.<>?/`~\-\s\\])',
            escape_symbols=r"\\\1",
        ),
    }


# Module-level instance imported by the Lucene render and tokenizer modules.
lucene_escape_manager = LuceneEscapeManager()

translator/app/translator/platforms/base/lucene/renders/lucene.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,15 @@
2121
from app.translator.const import DEFAULT_VALUE_TYPE
2222
from app.translator.core.render import BaseQueryRender
2323
from app.translator.core.render import BaseQueryFieldValue
24+
from app.translator.platforms.base.lucene.escape_manager import lucene_escape_manager
2425

2526

2627
class LuceneFieldValue(BaseQueryFieldValue):
27-
28-
def apply_value(self, value: Union[str, int]):
29-
return value
28+
escape_manager = lucene_escape_manager
3029

3130
def equal_modifier(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
3231
if isinstance(value, list):
33-
values = self.or_token.join(self.apply_value(f'{v}') for v in value)
32+
values = self.or_token.join(f'{self.apply_value(v)}' for v in value)
3433
return f"{field}:({values})"
3534
return f'{field}:{self.apply_value(value)}'
3635

@@ -48,29 +47,29 @@ def greater_or_equal_modifier(self, field: str, value: Union[int, str]) -> str:
4847

4948
def not_equal_modifier(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
5049
if isinstance(value, list):
51-
values = self.or_token.join(self.apply_value(f'{v}') for v in value)
50+
values = self.or_token.join(f'{self.apply_value(v)}' for v in value)
5251
return f"NOT ({field} = ({values})"
5352
return f'NOT ({field} = {self.apply_value(value)})'
5453

5554
def contains_modifier(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
5655
if isinstance(value, list):
57-
values = self.or_token.join(self.apply_value(f'*{v}*') for v in value)
56+
values = self.or_token.join(f'*{self.apply_value(v)}*' for v in value)
5857
return f"{field}:({values})"
59-
prepared_value = self.apply_value(f"*{value}*")
58+
prepared_value = f"*{self.apply_value(value)}*"
6059
return f'{field}:{prepared_value}'
6160

6261
def endswith_modifier(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
6362
if isinstance(value, list):
64-
values = self.or_token.join(self.apply_value(f'*{v}') for v in value)
63+
values = self.or_token.join(f'*{self.apply_value(v)}' for v in value)
6564
return f"{field}:({values})"
66-
prepared_value = self.apply_value(f"*{value}")
65+
prepared_value = f"*{self.apply_value(value)}"
6766
return f'{field}:{prepared_value}'
6867

6968
def startswith_modifier(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
7069
if isinstance(value, list):
71-
values = self.or_token.join(self.apply_value(f'{v}*') for v in value)
70+
values = self.or_token.join(f'{self.apply_value(v)}*' for v in value)
7271
return f"{field}:({values})"
73-
prepared_value = self.apply_value(f"{value}*")
72+
prepared_value = f"{self.apply_value(value)}*"
7473
return f'{field}:{prepared_value}'
7574

7675
def regex_modifier(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
@@ -81,7 +80,7 @@ def regex_modifier(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
8180
def keywords(self, field: str, value: DEFAULT_VALUE_TYPE) -> str:
8281
if isinstance(value, list):
8382
return f"({self.or_token.join(self.keywords(field=field, value=v) for v in value)})"
84-
return self.apply_value(f"*{value}*")
83+
return f"*{self.apply_value(value)}*"
8584

8685

8786
class LuceneQueryRender(BaseQueryRender):

translator/app/translator/platforms/base/lucene/tokenizer.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@
1919

2020
from typing import Tuple, Union, List, Any
2121

22+
from app.translator.core.custom_types.values import ValueType
2223
from app.translator.core.exceptions.parser import TokenizerGeneralException
2324
from app.translator.core.mixins.logic import ANDLogicOperatorMixin
2425
from app.translator.core.models.field import Keyword, Field
2526
from app.translator.core.models.identifier import Identifier
2627
from app.translator.core.tokenizer import QueryTokenizer
2728
from app.translator.core.custom_types.tokens import OperatorType
29+
from app.translator.platforms.base.lucene.escape_manager import lucene_escape_manager
2830
from app.translator.tools.utils import get_match_group
2931

3032

@@ -41,19 +43,21 @@ class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
4143
field_pattern = r"(?P<field_name>[a-zA-Z\.\-_]+)"
4244
match_operator_pattern = r"(?:___field___\s*(?P<match_operator>:\[\*\sTO|:\[|:<|:>|:))\s*"
4345
_num_value_pattern = r"\d+(?:\.\d+)*"
44-
num_value_pattern = fr"(?P<num_value>{_num_value_pattern})\s*"
45-
double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'
46-
no_quotes_value_pattern = r"(?P<n_q_value>(?:[a-zA-Z\*0-9=%#_/,\'\.$@]|\\\"|\\\\)+)\s*"
47-
re_value_pattern = r"/(?P<re_value>[:a-zA-Z\*0-9=+%#\\\-_\,\"\'\.$&^@!\(\)\{\}\[\]\s?]+)/\s*"
48-
gte_value_pattern = fr"\[\s*(?P<gte_value>{_num_value_pattern})\s+TO\s+\*\s*\]"
49-
lte_value_pattern = fr"\[\s*\*\s+TO\s+(?P<lte_value>{_num_value_pattern})\s*\]"
46+
num_value_pattern = fr"(?P<{ValueType.number_value}>{_num_value_pattern})\s*"
47+
double_quotes_value_pattern = fr'"(?P<{ValueType.double_quotes_value}>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{{\}}\s]|\\\"|\\)*)"\s*'
48+
no_quotes_value_pattern = fr"(?P<{ValueType.no_quotes_value}>(?:[a-zA-Z\*0-9=%#_/,\'\.$@]|\\\"|\\\\)+)\s*"
49+
re_value_pattern = fr"/(?P<{ValueType.regular_expression_value}>[:a-zA-Z\*0-9=+%#\\\-_\,\"\'\.$&^@!\(\)\{{\}}\[\]\s?]+)/\s*"
50+
gte_value_pattern = fr"\[\s*(?P<{ValueType.greater_than_or_equal}>{_num_value_pattern})\s+TO\s+\*\s*\]"
51+
lte_value_pattern = fr"\[\s*\*\s+TO\s+(?P<{ValueType.less_than_or_equal}>{_num_value_pattern})\s*\]"
5052
range_value_pattern = fr"{gte_value_pattern}|{lte_value_pattern}"
5153
_value_pattern = fr"{num_value_pattern}|{re_value_pattern}|{no_quotes_value_pattern}|{double_quotes_value_pattern}|{range_value_pattern}"
52-
keyword_pattern = r"(?P<n_q_value>(?:[a-zA-Z\*0-9=%#_/,\'\.$@]|\\\"|\\\(|\\\)|\\\[|\\\]|\\\{|\\\}|\\\:|\\)+)(?:\s+|\)|$)"
54+
keyword_pattern = fr"(?P<{ValueType.no_quotes_value}>(?:[a-zA-Z\*0-9=%#_/,\'\.$@]|\\\"|\\\(|\\\)|\\\[|\\\]|\\\{{|\\\}}|\\\:|\\)+)(?:\s+|\)|$)"
5355

54-
multi_value_pattern = r"""\((?P<value>[:a-zA-Z\"\*0-9=+%#\-_\/\\'\,.&^@!\(\[\]\s]+)\)"""
56+
multi_value_pattern = fr"""\((?P<{ValueType.value}>[:a-zA-Z\"\*0-9=+%#\-_\/\\'\,.&^@!\(\[\]\s]+)\)"""
5557
multi_value_check_pattern = r"___field___\s*___operator___\s*\("
5658

59+
escape_manager = lucene_escape_manager
60+
5761
wildcard_symbol = "*"
5862

5963
@staticmethod
@@ -69,22 +73,22 @@ def clean_quotes(value: Union[str, int]):
6973
return value
7074

7175
def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.EQ) -> Tuple[str, Any]:
72-
if (num_value := get_match_group(match, group_name='num_value')) is not None:
76+
if (num_value := get_match_group(match, group_name=ValueType.number_value)) is not None:
7377
return operator, num_value
7478

75-
elif (re_value := get_match_group(match, group_name='re_value')) is not None:
79+
elif (re_value := get_match_group(match, group_name=ValueType.regular_expression_value)) is not None:
7680
return OperatorType.REGEX, re_value
7781

78-
elif (n_q_value := get_match_group(match, group_name='n_q_value')) is not None:
82+
elif (n_q_value := get_match_group(match, group_name=ValueType.no_quotes_value)) is not None:
7983
return operator, n_q_value
8084

81-
elif (d_q_value := get_match_group(match, group_name='d_q_value')) is not None:
85+
elif (d_q_value := get_match_group(match, group_name=ValueType.double_quotes_value)) is not None:
8286
return operator, d_q_value
8387

84-
elif (gte_value := get_match_group(match, group_name='gte_value')) is not None:
88+
elif (gte_value := get_match_group(match, group_name=ValueType.greater_than_or_equal)) is not None:
8589
return OperatorType.GTE, gte_value
8690

87-
elif (lte_value := get_match_group(match, group_name='lte_value')) is not None:
91+
elif (lte_value := get_match_group(match, group_name=ValueType.less_than_or_equal)) is not None:
8892
return OperatorType.LTE, lte_value
8993

9094
return super().get_operator_and_value(match, operator)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from app.translator.core.custom_types.values import ValueType
2+
from app.translator.core.escape_manager import EscapeManager
3+
from app.translator.core.models.escape_details import EscapeDetails
4+
5+
6+
class SplEscapeManager(EscapeManager):
    """Escape manager holding the escaping rule for SPL query values."""

    escape_map = {
        # Matches a double quote, or a backslash that is neither preceded by
        # a backslash nor followed by "*", "?" or another backslash. The
        # replacement is the EscapeDetails default.
        ValueType.value: EscapeDetails(
            pattern='("|(?<!\\\\)\\\\(?![*?\\\\]))',
        ),
    }


# Module-level instance of the SPL escape manager.
spl_escape_manager = SplEscapeManager()

0 commit comments

Comments
 (0)