Source code for delphin.tsql


"""
TSQL -- Test Suite Query Language

This module implements a subset of TSQL, namely the 'select' (or
'retrieve') queries for extracting data from test suites. The general
form of a select query is::

    [select] <projection> [from <tables>] [where <condition>]*

For example, the following selects item identifiers that took more
than half a second to parse::

    select i-id from item where total > 500

The `select` string is necessary when querying with the generic
:func:`query` function, but is implied and thus disallowed when using
the :func:`select` function.

The `<projection>` is a list of space-separated field names (e.g.,
`i-id i-input mrs`), or the special string `*` which selects all
columns from the joined tables.

The optional `from` clause provides a list of table names (e.g.,
`item parse result`) that are joined on shared keys. The `from`
clause is required when `*` is used for the projection, but it can
also be used to select columns from non-standard tables (e.g., `i-id
from output`). Alternatively, `delphin.itsdb`-style data specifiers
(see :func:`delphin.itsdb.get_data_specifier`) may be used to specify
the table on the column name (e.g., `item:i-id`).

The `where` clause provide conditions for filtering the list of
results. Conditions are binary operations that take a column or data
specifier on the left side and an integer (e.g., `10`), a date (e.g.,
`2018-10-07`), or a string (e.g., `"sleep"`) on the right side of the
operator. The allowed conditions are:

    ================  ======================================
    Condition         Form
    ================  ======================================
    Regex match       ``<field> ~ "regex"``
    Regex fail        ``<field> !~ "regex"``
    Equality          ``<field> = (integer|date|"string")``
    Inequality        ``<field> != (integer|date|"string")``
    Less-than         ``<field> < (integer|date)``
    Less-or-equal     ``<field> <= (integer|date)``
    Greater-than      ``<field> > (integer|date)``
    Greater-or-equal  ``<field> >= (integer|date)``
    ================  ======================================

Boolean operators can be used to join multiple conditions or for
negation:

    ===========  =====================================
    Operation    Form
    ===========  =====================================
    Disjunction  ``X | Y``, ``X || Y``, or ``X or Y``
    Conjunction  ``X & Y``, ``X && Y``, or ``X and Y``
    Negation     ``!X`` or ``not X``
    ===========  =====================================

Normally, disjunction scopes over conjunction, but parentheses may be
used to group clauses, so the following are equivalent::

    ... where i-id = 10 or i-id = 20 and i-input ~ "[Dd]og"
    ... where i-id = 10 or (i-id = 20 and i-input ~ "[Dd]og")

Multiple `where` clauses may also be used as a conjunction that scopes
over disjunction, so the following are equivalent::

    ... where (i-id = 10 or i-id = 20) and i-input ~ "[Dd]og"
    ... where i-id = 10 or i-id = 20 where i-input ~ "[Dd]og"

This facilitates query construction, where a user may want to apply
additional global constraints by appending new conditions to the query
string.

PyDelphin has several differences to standard TSQL:

* `select *` requires a `from` clause
* `select * from item result` does not also include columns from the
  intervening `parse` table
* `select i-input from result` returns a matching `i-input` for every
  row in `result`, rather than only the unique rows

PyDelphin also adds some features to standard TSQL:

* optional table specifications on columns (e.g., `item:i-id`)
* multiple `where` clauses (as described above)
"""

import operator
import copy
import re

from delphin.exceptions import TSQLSyntaxError
from delphin.util import LookaheadIterator, parse_datetime
from delphin import itsdb


### QUERY INSPECTION ##########################################################

[docs]def inspect_query(query):
    """
    Parse *query* and return the interpreted query object.

    Example:
        >>> from delphin import tsql
        >>> from pprint import pprint
        >>> pprint(tsql.inspect_query(
        ...     'select i-input from item where i-id < 100'))
        {'querytype': 'select',
         'projection': ['i-input'],
         'tables': ['item'],
         'where': ('<', ('i-id', 100))}
    """
    return _parse_query(query)

### QUERY PROCESSING ##########################################################

[docs]def query(query, ts, **kwargs):
    """
    Perform *query* on the testsuite *ts*.

    Note: currently only 'select' queries are supported.

    Args:
        query (str): TSQL query string
        ts (:class:`delphin.itsdb.TestSuite`): testsuite to query over
        kwargs: keyword arguments passed to the more specific query
            function (e.g., :func:`select`)
    Example:
        >>> list(tsql.query('select i-id where i-length < 4', ts))
        [[142], [1061]]
    """
    queryobj = _parse_query(query)

    if queryobj['querytype'] in ('select', 'retrieve'):
        return _select(
            queryobj['projection'],
            queryobj['tables'],
            queryobj['where'],
            ts,
            mode=kwargs.get('mode', 'list'),
            cast=kwargs.get('cast', True))
    else:
        # not really a syntax error; replace with TSQLError or something
        # when the proper exception class exists
        raise TSQLSyntaxError(queryobj['querytype'] +
                              ' queries are not supported')


[docs]def select(query, ts, mode='list', cast=True):
    """
    Perform the TSQL selection query *query* on testsuite *ts*.

    Note: The `select`/`retrieve` part of the query is not included.

    Args:
        query (str): TSQL select query
        ts (:class:`delphin.itsdb.TestSuite`): testsuite to query over
        mode (str): how to return the results (see
            :func:`delphin.itsdb.select_rows` for more information
            about the *mode* parameter; default: `list`)
        cast (bool): if `True`, values will be cast to their datatype
            according to the testsuite's relations (default: `True`)
    Example:
        >>> list(tsql.select('i-id where i-length < 4', ts))
        [[142], [1061]]
    """
    queryobj = _parse_select(query)
    return _select(
        queryobj['projection'],
        queryobj['tables'],
        queryobj['where'],
        ts,
        mode,
        cast)


def _select(projection, tables, condition, ts, mode, cast):
    table = _select_from(tables, None, ts)
    table = _select_projection(projection, table, ts)
    table = _select_where(condition, table, ts)

    # finally select the relevant columns from the joined table
    if projection == '*':
        if len(tables) == 1:
            projection = [f.name for f in ts.relations[tables[0]]]
        else:
            projection = []
            for t in tables:
                projection.extend(t + ':' + f.name
                                  for f in ts.relations[t])
    return itsdb.select_rows(projection, table, mode=mode, cast=cast)


def _select_from(tables, table, ts):
    joined = set([] if table is None else table.name.split('+'))
    for tab in tables:
        if tab not in joined:
            joined.add(tab)
            table = _transitive_join(table, ts[tab], ts, 'inner')
    return table


def _select_projection(projection, table, ts):
    if projection != '*':
        for p in projection:
            table = _join_if_missing(table, p, ts, 'inner')
    return table


def _select_where(condition, table, ts):
    keys = table.fields.keys()
    ids = set()
    if condition is not None:
        func, fields = _process_condition(condition)
        # join tables in the condition for filtering
        tmptable = table
        for field in fields:
            tmptable = _join_if_missing(tmptable, field, ts, 'left')
        # filter the rows and store the keys only
        for record in filter(func, tmptable):
            idtuple = tuple(record[key] for key in keys)
            ids.add(idtuple)
        # check if a matching idtuple was retained
        def meta_condition(rec):
            return tuple(rec[key] for key in keys) in ids
        table[:] = filter(meta_condition, table)
    return table


_operator_functions = {'==': operator.eq,
                       '!=': operator.ne,
                       '<': operator.lt,
                       '<=': operator.le,
                       '>': operator.gt,
                       '>=': operator.ge}


def _process_condition(condition):
    # conditions are something like:
    #  ('==', ('i-id', 11))
    op, body = condition
    if op in ('and', 'or'):
        fields = []
        conditions = []
        for cond in body:
            _func, _fields = _process_condition(cond)
            fields.extend(_fields)
            conditions.append(_func)
        _func = all if op == 'and' else any
        def func(row):
            return _func(cond(row) for cond in conditions)
    elif op == 'not':
        nfunc, fields = _process_condition(body)
        func = lambda row, nfunc=nfunc: not nfunc(row)
    elif op == '~':
        fields = [body[0]]
        func = lambda row, body=body: re.search(body[1], row[body[0]])
    elif op == '!~':
        fields = [body[0]]
        func = lambda row, body=body: not re.search(body[1], row[body[0]])
    else:
        fields = [body[0]]
        compare = _operator_functions[op]
        def func(row):
            return compare(row.get(body[0], cast=True), body[1])
    return func, fields


def _join_if_missing(table, col, ts, how):
    tab, _, column = col.rpartition(':')
    if not tab:
        # Just get the first table defining the column. This
        # makes the assumption that relations are ordered and
        # that the first one is 'primary'
        tab = ts.relations.find(column)[0]
    if table is None or column not in table.fields:
        table = _transitive_join(table, ts[tab], ts, how)
    return table


def _transitive_join(tab1, tab2, ts, how):
    if tab1 is None:
        table = itsdb.Table(tab2.fields, list(tab2))
    else:
        table = tab1
        # the tables may not be directly joinable but could be
        # joinable transitively via a 'path' of table joins
        path = ts.relations.path(tab1.name, tab2.name)
        for intervening, pivot in path:
            table = itsdb.join(table, ts[intervening], on=pivot, how=how)
    return table


### QUERY PARSING #############################################################

_keywords = list(map(re.escape,
                     ('info', 'set', 'retrieve', 'select', 'insert',
                      'from', 'where', 'report', '*', '.')))
_operators = list(map(re.escape,
                      ('==', '=', '!=', '~', '!~', '<=', '<', '>=', '>',
                       '&&', '&', 'and', '||', '|', 'or', '!', 'not')))

_tsql_lex_re = re.compile(
    r'''# regex-pattern                      gid  description
    ({keywords})                           #   1  keywords
    |({operators})                         #   2  operators
    |(\(|\))                               #   3  parentheses
    |"([^"\\]*(?:\\.[^"\\]*)*)"            #   4  double-quoted "strings"
    |'([^'\\]*(?:\\.[^'\\]*)*)'            #   5  single-quoted 'strings'
    |({yyyy}-{m}(?:-{d})?(?:{t}|{tt})?)    #   6  yyyy-mm-dd date
    |((?:{d}-)?{m}-{yy}(?:{t}|{tt})?)      #   7  dd-mm-yy date
    |(:today|now)                          #   8  keyword date
    |([+-]?\d+)                            #   9  integers
    |((?:{id}:)?{id}(?:@(?:{id}:)?{id})*)  #  10  identifier (extended def)
    |([^\s])                               #  11  unexpected
    '''.format(keywords='|'.join(_keywords),
               operators='|'.join(_operators),
               d=r'[0-9]{1,2}',
               m=(r'(?:[0-9]{1,2}|'
                  r'jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'),
               yy=r'(?:[0-9]{2})?[0-9]{2}',
               yyyy=r'[0-9]{4}',
               t=r'\s*\([0-9]{2}:[0-9]{2}(?::[0-9]{2})?\)',
               tt=r'\s+[0-9]{2}:[0-9]{2}(?::[0-9]{2})',
               id=r'[a-zA-Z][-_a-zA-Z0-9]*'),
    flags=re.VERBOSE|re.IGNORECASE)


def _lex(s):
    """
    Lex the input string according to _tsql_lex_re.

    Yields
        (gid, token, line_number)
    """
    s += '.'  # make sure there's a terminator to know when to stop parsing
    lines = enumerate(s.splitlines(), 1)
    lineno = pos = 0
    try:
        for lineno, line in lines:
            matches = _tsql_lex_re.finditer(line)
            for m in matches:
                gid = m.lastindex
                if gid == 11:
                    raise TSQLSyntaxError('unexpected input',
                                          lineno=lineno,
                                          offset=m.start(),
                                          text=line)
                else:
                    token = m.group(gid)
                    yield (gid, token, lineno)
    except StopIteration:
        pass


def _parse_query(query):
    querytype, _, querybody = query.lstrip().partition(' ')
    querytype = querytype.lower()
    if querytype in ('select', 'retrieve'):
        result = _parse_select(querybody)
    else:
        raise TSQLSyntaxError("'{}' queries are not supported"
                              .format(querytype), lineno=1)

    return result


def _parse_select(query):
    tokens = LookaheadIterator(_lex(query))
    _, token, lineno = tokens.peek()  # maybe used in error below

    projection = _parse_select_projection(tokens)
    tables = _parse_select_from(tokens)
    condition = _parse_select_where(tokens)

    if projection == '*' and not tables:
        raise TSQLSyntaxError(
            "'select *' requires a 'from' clause",
            lineno=lineno, text=token)

    # verify we're at the end of the query (the '.' may have been
    # added in _lex())
    gid, token, lineno = tokens.next()
    _expect(gid == 1 and token == '.', token, lineno, "'.'")

    return {'querytype': 'select',
            'projection': projection,
            'tables': tables,
            'where': condition}


def _parse_select_projection(tokens):
    gid, token, lineno = tokens.next()
    if token == '*':
        projection = token
    elif gid == 10:
        projection = [token]
        while tokens.peek()[0] == 10:
            _, col, _ = tokens.next()
            projection.append(col)
        projection = _prepare_columns(projection)
    else:
        raise TSQLSyntaxError("expected '*' or column identifiers",
                              lineno=lineno, text=token)
    return projection


def _prepare_columns(cols):
    columns = []
    for col in cols:
        table = ''
        for part in col.split('@'):
            tblname, _, colname = part.rpartition(':')
            if tblname:
                table = tblname + ':'
            columns.append(table + colname)
    return columns


def _parse_select_from(tokens):
    tables = []
    if tokens.peek()[1] == 'from':
        tokens.next()
        while tokens.peek()[0] == 10:
            _, table, _ = tokens.next()
            tables.append(table)
    return tables


def _parse_select_where(tokens):
    conditions = []
    while tokens.peek()[1] == 'where':
        tokens.next()
        conditions.append(_parse_condition_disjunction(tokens))
    if len(conditions) == 1:
        condition = conditions[0]
    elif len(conditions) > 1:
        condition = ('and', conditions)
    else:
        condition = None
    return condition


def _parse_condition_disjunction(tokens):
    conds = []
    while True:
        cond = _parse_condition_conjunction(tokens)
        if cond is not None:
            conds.append(cond)
        if tokens.peek()[1] in ('|', '||', 'or'):
            tokens.next()
            nextgid, nexttoken, nextlineno = tokens.peek()
        else:
            break
    if len(conds) == 0:
        return None
    elif len(conds) == 1:
        return conds[0]
    else:
        return ('or', tuple(conds))


def _parse_condition_conjunction(tokens):
    conds = []
    nextgid, nexttoken, nextlineno = tokens.peek()
    while True:
        if nextgid == 2 and nexttoken.lower() in ('!', 'not'):
            cond = _parse_condition_negation(tokens)
        elif nextgid == 3 and nexttoken == '(':
            cond = _parse_condition_group(tokens)
        elif nextgid == 3 and nexttoken == ')':
            break
        elif nextgid == 10:
            cond = _parse_condition_statement(tokens)
        else:
            raise TSQLSyntaxError("expected '!', 'not', '(', or a column name",
                                  lineno=nextlineno, text=nexttoken)
        conds.append(cond)
        if tokens.peek()[1].lower() in ('&', '&&', 'and'):
            tokens.next()
            nextgid, nexttoken, nextlineno = tokens.peek()
        else:
            break

    if len(conds) == 0:
        return None
    elif len(conds) == 1:
        return conds[0]
    else:
        return ('and', tuple(conds))


def _parse_condition_negation(tokens):
    gid, token, lineno = tokens.next()
    _expect(gid == 2 and token in ('!', 'not'), token, lineno, "'!' or 'not'")
    cond = _parse_condition_disjunction(tokens)
    return ('not', cond)


def _parse_condition_group(tokens):
    gid, token, lineno = tokens.next()
    _expect(gid == 3 and token == '(', token, lineno, "'('")
    cond = _parse_condition_disjunction(tokens)
    gid, token, lineno = tokens.next()
    _expect(gid == 3 and token == ')', token, lineno, "')'")
    return tuple(cond)


def _parse_condition_statement(tokens):
    gid, column, lineno = tokens.next()
    _expect(gid == 10, column, lineno, 'a column name')
    gid, op, lineno = tokens.next()
    _expect(gid == 2, op, lineno, 'an operator')
    if op == '=':
        op = '=='  # normalize = to == (I think these are equivalent)
    gid, value, lineno = tokens.next()
    if op in ('~', '!~') and gid not in (4, 5):
        raise TSQLSyntaxError(
            "the '{}' operator is only valid with strings".format(op),
            lineno=lineno, text=op)
    elif op in ('<', '<=', '>', '>=') and gid not in (6, 7, 8, 9):
        raise TSQLSyntaxError(
            "the '{}' operator is only valid with integers and dates"
            .format(op), lineno=lineno, text=op)
    else:
        if gid in (6, 7, 8):
            value = parse_datetime(value)
        elif gid == 9:
            value = int(value)
        return (op, (column, value))


def _expect(expected, token, lineno, msg):
    msg = 'expected ' + msg
    if not expected:
        raise TSQLSyntaxError(msg, lineno=lineno, text=token)