@@ -43,35 +43,34 @@ def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
4343
4444
class QueryTokenizer(BaseTokenizer):
    """Splits a textual query into Field / Keyword / Identifier tokens.

    Subclasses configure tokenization by overriding the regex building
    blocks below; ``__init_subclass__`` derives the combined
    ``operators_map``, ``value_pattern`` and ``operator_pattern``.
    """

    # Used to generate the operator re pattern, so the key order is important:
    # longer operators must precede their prefixes (e.g. ">=" before ">").
    single_value_operators_map: dict[str, str] = {}
    multi_value_operators_map: dict[str, str] = {}
    operators_map: dict[str, str] = {}  # derived: single- + multi-value maps

    logical_operator_pattern = r"\s?(?P<logical_operator>and|or|not|AND|OR|NOT)\s?"
    # Template pattern; "___field___"/"___operator___"/"___value___" are
    # substituted per field expression at match time.
    field_value_pattern = r"""^___field___\s*___operator___\s*___value___"""
    base_value_pattern = r"(?:___value_pattern___)"

    # do not modify, use subclasses to define these attributes
    field_pattern: str = None
    _value_pattern: str = None
    value_pattern: str = None
    multi_value_pattern: str = None
    keyword_pattern: str = None

    multi_value_delimiter = ","
    wildcard_symbol = None
7364 def __init_subclass__ (cls , ** kwargs ):
65+ cls ._validate_re_patterns ()
7466 cls .value_pattern = cls .base_value_pattern .replace ('___value_pattern___' , cls ._value_pattern )
67+ cls .operators_map = {** cls .single_value_operators_map , ** cls .multi_value_operators_map }
68+ cls .operator_pattern = fr"""(?:___field___\s*(?P<operator>(?:{ '|' .join (cls .operators_map )} )))\s*"""
69+
70+ @classmethod
71+ def _validate_re_patterns (cls ):
72+ if not all ([cls .field_pattern , cls ._value_pattern ]):
73+ raise ValueError (f"{ cls .__name__ } re patterns must be set" )
7574
7675 def map_operator (self , operator : str ) -> str :
7776 try :
@@ -89,16 +88,16 @@ def search_field(self, query):
def escape_field_name(self, field_name):
    """Escape literal dots so the field name can be embedded in a regex."""
    return field_name.replace(".", r"\.")
9190
def search_operator(self, query, field_name) -> str:
    """Locate the comparison operator that follows ``field_name`` in ``query``.

    Substitutes the escaped field name into the class-level
    ``operator_pattern`` template and searches case-insensitively.

    Raises:
        TokenizerGeneralException: when no known operator follows the field.
    """
    field_name = self.escape_field_name(field_name)
    operator_pattern = self.operator_pattern.replace("___field___", field_name)
    compiled_operator_regex = re.compile(operator_pattern, re.IGNORECASE)
    if (operator_search := re.search(compiled_operator_regex, query)) is None:
        raise TokenizerGeneralException(error=f"Operator couldn't be found in query part: {query}")

    # Operators may be captured with surrounding spaces; normalize them away.
    return operator_search.group("operator").strip(" ")
102101
def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.EQ) -> Tuple[str, Any]:
    """Return the (operator, value) pair extracted from a field-value match.

    The value is read from the named ``value`` group of ``match``;
    ``operator`` defaults to equality.
    """
    return operator, get_match_group(match, group_name='value')
@@ -118,7 +117,7 @@ def search_value(self, query: str, operator: str, field_name: str) -> Tuple[str,
118117 field_value_pattern = self .get_field_value_pattern (operator , field_name )
119118 value_pattern = self .value_pattern
120119 is_multi = False
121- if operator .lower () in self .multi_value_operators :
120+ if operator .lower () in self .multi_value_operators_map :
122121 value_pattern = self .multi_value_pattern
123122 is_multi = True
124123
@@ -142,7 +141,7 @@ def search_keyword(self, query: str) -> Tuple[Keyword, str]:
142141
def get_field_value_pattern(self, operator, field_name):
    """Build a concrete field-value regex for the given field and operator.

    Substitutes the escaped field name and the operator into the
    ``field_value_pattern`` template (the ``___value___`` placeholder is
    filled in later by the caller).
    """
    field_value_pattern = self.field_value_pattern.replace("___field___", self.escape_field_name(field_name))
    return field_value_pattern.replace("___operator___", operator)
146145
147146 @staticmethod
148147 def _clean_value (value : str , wildcard_symbol : str ) -> str :
@@ -183,28 +182,45 @@ def create_field(field_name: str, operator: Identifier, value: Union[str, List])
183182
def search_field_value(self, query):
    """Tokenize the leading field expression of ``query``.

    Finds the field name, its operator, and its value, resolves any
    wildcard symbols, and builds the Field token.

    Returns:
        A (field, remaining_query) tuple with the unconsumed query tail.
    """
    field_name = self.search_field(query)
    operator = self.search_operator(query, field_name)
    query, operator, value = self.search_value(query=query, operator=operator, field_name=field_name)
    value, operator_token = self.process_value_wildcard_symbols(value=value,
                                                                operator=operator,
                                                                wildcard_symbol=self.wildcard_symbol)
    field = self.create_field(field_name=field_name, operator=operator_token, value=value)
    return field, query
193192
194- def __get_identifier (self , query : str ) -> Tuple [Union [Field , Keyword , Identifier ], str ]:
193+ def _match_field_value (self , query : str , white_space_pattern : str = r"\s+" ) -> bool :
194+ single_value_operator_group = fr"(?:{ '|' .join (self .single_value_operators_map )} )"
195+ single_value_pattern = fr"""{ self .field_pattern } \s*{ single_value_operator_group } \s*{ self .value_pattern } \s*"""
196+ if re .match (single_value_pattern , query , re .IGNORECASE ):
197+ return True
198+
199+ if self .multi_value_operators_map :
200+ multi_value_operator_group = fr"(?:{ '|' .join (self .multi_value_operators_map )} )"
201+ pattern = f"{ self .field_pattern } { white_space_pattern } { multi_value_operator_group } { white_space_pattern } "
202+ multi_value_pattern = fr"{ pattern } { self .multi_value_pattern } "
203+ if re .match (multi_value_pattern , query , re .IGNORECASE ):
204+ return True
205+
206+ return False
207+
def _get_identifier(self, query: str) -> Tuple[Union[Field, Keyword, Identifier], str]:
    """Consume one token from the head of ``query``.

    Recognizes, in priority order: parentheses, logical operators
    (and/or/not), field-value expressions, and — when the subclass defines
    ``keyword_pattern`` — bare keywords.

    Returns:
        A (token, remaining_query) tuple.

    Raises:
        TokenizerGeneralException: when the query head matches none of the
            supported token forms.
    """
    query = query.strip("\n").strip(" ").strip("\n")
    if query.startswith(GroupType.L_PAREN):
        return Identifier(token_type=GroupType.L_PAREN), query[1:]
    elif query.startswith(GroupType.R_PAREN):
        return Identifier(token_type=GroupType.R_PAREN), query[1:]
    elif logical_operator_search := re.match(self.logical_operator_pattern, query):
        logical_operator = logical_operator_search.group("logical_operator")
        pos = logical_operator_search.end()
        return Identifier(token_type=logical_operator.lower()), query[pos:]
    elif self._match_field_value(query):
        return self.search_field_value(query)
    elif self.keyword_pattern and re.match(self.keyword_pattern, query):
        return self.search_keyword(query)

    raise TokenizerGeneralException("Unsupported query entry")
208224
209225 @staticmethod
210226 def _validate_parentheses (tokens ):
@@ -224,7 +240,7 @@ def _validate_parentheses(tokens):
def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
    """Convert ``query`` into a flat token list.

    Repeatedly consumes one token from the head of the query until it is
    exhausted, then validates that parentheses are balanced.
    """
    tokenized = []
    while query:
        identifier, query = self._get_identifier(query=query)
        tokenized.append(identifier)
    self._validate_parentheses(tokenized)
    return tokenized
0 commit comments