背景
- 针对 postgres 某个表里面的 labels 字段(labels 字段是一维数组类型)做复杂查询,要求支持常见的且,或,非等等功能
- 原来的业务逻辑已实现了且,或,非功能,但不支持括号来提高运算符的优先级
方案
显而易见的方案就是实现一个 python 版本的语法解析器来支持各种语法,有以下方案
综上,采用方案 1
实现
由于后端使用的 django 框架,根据数据库的不同以及 orm 的不同,操作数据库一般有 3 种形式
- 原生 postgres sql 查询
- django Q 对象查询
- 原生 mongodb sql 查询
结合业务,只需要实现 1, 2,主要功能如下
- () - , | % 分别代表左括号、右括号、非、且、或、模糊搜索,优先级 () > % > - > , > |
- 如果要搜索的 label 带有上述字符,可以用 \ 转义,不是开头的 -、% 可以不用转义
原生 postgres sql 查询
utils/pg_sql_utils.py
import re
from pydash import py_
def replace_dashes(match):
    result = match.group()
    first_char = ''
    if result.startswith('-'):
        first_char = '-'
        result = result[1:]
    return first_char + re.sub(r'(?<!\\)-', r'\-', result)
def parse_sql(common_expression, like_expression, sql):
    # () - , | 分别代表左右括号,非,且,或,优先级 () > - > , > |,如果要搜索的 label 带有上述 5 种字符,可以用 \ 转义,非开头的 - 不需要转义
    sql = sql.strip()
    if not sql:
        return ''
    # 预处理,将筛选出 - 开头的匹配项,然后对匹配项里面除了开头的 - 以外的 - 替换为 \-,最后将替换的结果拼接到原字符串
    # 就是将下面的 [^()|,-])+ 改为 [^()|,])+,在这里处理替换 - 开头的情况,即将剩余的 - 替换为 \-
    pattern = r'((?:\\[()|,-])|[^()|,])+'
    sql = re.sub(pattern, replace_dashes, sql)
    print('sql:', sql)
    import ply.lex as lex
    import ply.yacc as yacc
    # 定义词法分析器的词法规则
    tokens = (  # noqa: F841
        'LPAREN',
        'RPAREN',
        'OR',
        'AND',
        'NOT',
        'TERM',
    )
    t_LPAREN = r'\('  # noqa: F841
    t_RPAREN = r'\)'  # noqa: F841
    t_OR = r'\|'  # noqa: F841
    t_AND = r','  # noqa: F841
    t_NOT = r'-'  # noqa: F841
    def t_TERM(t):
        # 匹配以 ()|,- 的分割的连续字符,但是要忽略转义字符 \,如 \- 表示匹配 -
        r'((?:\\[()|,-])|[^()|,-])+'
        # 去掉转义字符 \
        if t.value.startswith('\%'):
            t.value = '\%' + re.sub(r'\\([()|,\-%])', r'\1', t.value[2:])
        else:
            t.value = re.sub(r'\\([()|,\-%])', r'\1', t.value)
        return t
    # 忽略空格和制表符
    t_ignore = ' \t'  # noqa: F841
    # 错误处理
    def t_error(t):
        raise TypeError("Unknown text '%s'" % (py_.get(t, 'value'),))
    # 构建词法分析器
    lexer = lex.lex()
    # 确定运算符的优先级
    precedence = (  # noqa: F841
      ('left', 'OR'),
      ('left', 'AND'),
      ('right', 'NOT')
    )
    # 定义语法分析器的语法规则
    def p_expression_group(p):
        '''expression : LPAREN expression RPAREN'''
        p[0] = f'({p[2]})'
    def p_expression_or(p):
        '''expression : expression OR expression'''
        p[0] = f'{p[1]} OR {p[3]}'
    def p_expression_and(p):
        '''expression : expression AND expression'''
        p[0] = f'{p[1]} AND {(p[3])}'
    def p_expression_not(p):
        '''expression : NOT expression'''
        p[0] = f'NOT {p[2]}'
    def p_expression_term(p):
        '''expression : TERM'''
        if p[1].startswith('%'):
            p[0] = like_expression.format(
                value=p[1][1:])
        else:
            if p[1].startswith('\%'):
                p[1] = p[1][1:]
            p[0] = common_expression.format(
                value=p[1])
    def p_error(p):
        raise SyntaxError(
            f"Syntax error in input! Text is {sql}, Token is {py_.get(p, 'value')}")
    # 构建语法分析器
    parser = yacc.yacc()
    result = parser.parse(sql, lexer=lexer)
    return result
if __name__ == '__main__':
    common_expression = "labels::TEXT[] @> ARRAY ['{value}']"
    like_expression = "labels::TEXT LIKE '%{value}%'"
    print(
        parse_sql(common_expression, like_expression, 'a|-直行路口,-free-,\,\(\)\|space专项,(b|c),(d,e),%3434,\%3434'))
    print(
        parse_sql(common_expression, like_expression, '-ego\-turn\-right,ego-turn\-right,ego-turn-right,v4.0.4-f30-ota31-pro,v4.0.4\-f30\-ota31\-pro'))sql: a|-直行路口,-free\-,\,\(\)\|space专项,(b|c),(d,e),%3434,\%3434
labels::TEXT[] @> ARRAY ['a'] OR NOT labels::TEXT[] @> ARRAY ['直行路口'] AND NOT labels::TEXT[] @> ARRAY ['free-'] AND labels::TEXT[] @> ARRAY [',()|space专项'] AND (labels::TEXT[] @> ARRAY ['b'] OR labels::TEXT[] @> ARRAY ['c']) AND (labels::TEXT[] @> ARRAY ['d'] AND labels::TEXT[] @> ARRAY ['e']) AND labels::TEXT LIKE '%3434%' AND labels::TEXT[] @> ARRAY ['%3434']
sql: -ego\-turn\-right,ego\-turn\-right,ego\-turn\-right,v4.0.4\-f30\-ota31\-pro,v4.0.4\-f30\-ota31\-pro
NOT labels::TEXT[] @> ARRAY ['ego-turn-right'] AND labels::TEXT[] @> ARRAY ['ego-turn-right'] AND labels::TEXT[] @> ARRAY ['ego-turn-right'] AND labels::TEXT[] @> ARRAY ['v4.0.4-f30-ota31-pro'] AND labels::TEXT[] @> ARRAY ['v4.0.4-f30-ota31-pro']django Q 对象查询
utils/q_sql_utils.py
import re
from pydash import py_
from django.db.models import Q
def replace_dashes(match):
    result = match.group()
    first_char = ''
    if result.startswith('-'):
        first_char = '-'
        result = result[1:]
    return first_char + re.sub(r'(?<!\\)-', r'\-', result)
def parse_q(common_expression, like_expression, sql):
    # () - , | 分别代表左右括号,非,且,或,优先级 () > - > , > |,如果要搜索的 label 带有上述 5 种字符,可以用 \ 转义,非开头的 - 不需要转义
    sql = sql.strip()
    if not sql:
        return ''
    # 预处理,将筛选出 - 开头的匹配项,然后对匹配项里面除了开头的 - 以外的 - 替换为 \-,最后将替换的结果拼接到原字符串
    # 就是将下面的 [^()|,-])+ 改为 [^()|,])+,在这里处理替换 - 开头的情况,即将剩余的 - 替换为 \-
    pattern = r'((?:\\[()|,-])|[^()|,])+'
    sql = re.sub(pattern, replace_dashes, sql)
    print('sql:', sql)
    import ply.lex as lex
    import ply.yacc as yacc
    # 定义词法分析器的词法规则
    tokens = (  # noqa: F841
        'LPAREN',
        'RPAREN',
        'OR',
        'AND',
        'NOT',
        'TERM',
    )
    t_LPAREN = r'\('  # noqa: F841
    t_RPAREN = r'\)'  # noqa: F841
    t_OR = r'\|'  # noqa: F841
    t_AND = r','  # noqa: F841
    t_NOT = r'-'  # noqa: F841
    def t_TERM(t):
        # 匹配以 ()|,- 的分割的连续字符,但是要忽略转义字符 \,如 \- 表示匹配 -
        r'((?:\\[()|,-])|[^()|,-])+'
        # 去掉转义字符 \
        if t.value.startswith('\%'):
            t.value = '\%' + re.sub(r'\\([()|,\-%])', r'\1', t.value[2:])
        else:
            t.value = re.sub(r'\\([()|,\-%])', r'\1', t.value)
        return t
    # 忽略空格和制表符
    t_ignore = ' \t'  # noqa: F841
    # 错误处理
    def t_error(t):
        raise TypeError("Unknown text '%s'" % (py_.get(t, 'value'),))
    # 构建词法分析器
    lexer = lex.lex()
    # 确定运算符的优先级
    precedence = (  # noqa: F841
      ('left', 'OR'),
      ('left', 'AND'),
      ('right', 'NOT')
    )
    # 定义语法分析器的语法规则
    def p_expression_group(p):
        '''expression : LPAREN expression RPAREN'''
        p[0] = (p[2])
    def p_expression_or(p):
        '''expression : expression OR expression'''
        p[0] = Q(p[1]) | Q(p[3])
    def p_expression_and(p):
        '''expression : expression AND expression'''
        p[0] = Q(p[1]) & Q(p[3])
    def p_expression_not(p):
        '''expression : NOT expression'''
        p[0] = ~Q(p[2])
    def p_expression_term(p):
        '''expression : TERM'''
        if p[1].startswith('%'):
            p[0] = Q(**like_expression(p[1][1:]))
        else:
            if p[1].startswith('\%'):
                p[1] = p[1][1:]
            p[0] = Q(**common_expression(p[1]))
    def p_error(p):
        raise SyntaxError(
            f"Syntax error in input! Text is {sql}, Token is {py_.get(p, 'value')}")
    # 构建语法分析器
    parser = yacc.yacc()
    result = parser.parse(sql, lexer=lexer)
    return result
if __name__ == '__main__':
    def common_expression(value): return {'labels__contains': [value]}
    def like_expression(value): return {'labels__regex': r'%s' % value}
    print(
        parse_q(common_expression, like_expression, 'a|-直行路口,-free-,\,\(\)\|space专项,(b|c),(d,e),%3434,\%3434'))
    print(
        parse_q(common_expression, like_expression, '-ego\-turn\-right,ego-turn\-right,ego-turn-right,v4.0.4-f30-ota31-pro,v4.0.4\-f30\-ota31\-pro'))sql: a|-直行路口,-free\-,\,\(\)\|space专项,(b|c),(d,e),%3434,\%3434
(OR: (AND: ('labels__contains', ['a'])), (AND: (AND: (AND: (AND: (AND: (AND: (NOT (AND: (AND: ('labels__contains', ['直行路口'])))), (NOT (AND: (AND: ('labels__contains', ['free-']))))), (AND: ('labels__contains', [',()|space专项']))), (OR: (AND: ('labels__contains', ['b'])), (AND: ('labels__contains', ['c'])))), (AND: (AND: ('labels__contains', ['d'])), (AND: ('labels__contains', ['e'])))), (AND: ('labels__regex', '3434'))), (AND: ('labels__contains', ['%3434']))))
sql: -ego\-turn\-right,ego\-turn\-right,ego\-turn\-right,v4.0.4\-f30\-ota31\-pro,v4.0.4\-f30\-ota31\-pro
(AND: (AND: (AND: (AND: (NOT (AND: (AND: ('labels__contains', ['ego-turn-right'])))), (AND: ('labels__contains', ['ego-turn-right']))), (AND: ('labels__contains', ['ego-turn-right']))), (AND: ('labels__contains', ['v4.0.4-f30-ota31-pro']))), (AND: ('labels__contains', ['v4.0.4-f30-ota31-pro'])))