Source code for delphin.commands


"""
PyDelphin API counterparts to the `delphin` commands.

The public functions in this module largely mirror the front-end
subcommands provided by the `delphin` command, with some small
changes to argument names or values that make them better suited to
being called from within Python.
"""

import sys
import os
import io
import json
from functools import partial
import logging

from delphin import itsdb, tsql
from delphin.mrs import xmrs
from delphin.mrs.components import Lnk  # needed for Lnk.CHARSPAN in _repp()
from delphin.util import safe_int, SExpr
from delphin.exceptions import PyDelphinException


logging.basicConfig()

###############################################################################
### CONVERT ###################################################################

def convert(path, source_fmt, target_fmt, select='result:mrs',
            properties=True, show_status=False, predicate_modifiers=False,
            color=False, pretty_print=False, indent=None):
    """
    Convert between various DELPH-IN Semantics representations.

    Args:
        path (str, file): filename, testsuite directory, open file, or
            stream of input representations
        source_fmt (str): convert from this format
        target_fmt (str): convert to this format
        select (str): TSQL query for selecting data (ignored if *path*
            is not a testsuite directory; default: `"result:mrs"`)
        properties (bool): include morphosemantic properties if `True`
            (default: `True`)
        show_status (bool): show disconnected EDS nodes (ignored if
            *target_fmt* is not `"eds"`; default: `False`)
        predicate_modifiers (bool): apply EDS predicate modification
            for certain kinds of patterns (ignored if *target_fmt* is
            not an EDS format; default: `False`)
        color (bool): apply syntax highlighting if `True` and
            *target_fmt* is `"simplemrs"` (default: `False`)
        pretty_print (bool): if `True`, format the output with
            newlines and default indentation (default: `False`)
        indent (int, optional): specifies an explicit number of spaces
            for indentation (implies *pretty_print*)
    Returns:
        str: the converted representation
    """
    if source_fmt.startswith('eds') and not target_fmt.startswith('eds'):
        raise ValueError(
            'Conversion from EDS to non-EDS currently not supported.')

    if indent:
        pretty_print = True
        indent = 4 if indent is True else safe_int(indent)

    if len(tsql.inspect_query('select ' + select)['projection']) != 1:
        raise ValueError('Exactly 1 column must be given in selection query: '
                         '(e.g., result:mrs)')

    # read
    loads = _get_codec(source_fmt)
    if path is None:
        xs = loads(sys.stdin.read())
    elif hasattr(path, 'read'):
        xs = loads(path.read())
    elif os.path.isdir(path):
        ts = itsdb.TestSuite(path)
        xs = [
            next(iter(loads(r[0])), None)
            for r in tsql.select(select, ts)
        ]
    else:
        xs = loads(open(path, 'r').read())

    # write
    dumps = _get_codec(target_fmt, load=False)
    kwargs = {}
    if color:
        kwargs['color'] = color
    if pretty_print:
        kwargs['pretty_print'] = pretty_print
    if indent:
        kwargs['indent'] = indent
    if target_fmt == 'eds':
        kwargs['pretty_print'] = pretty_print
        kwargs['show_status'] = show_status
    if target_fmt.startswith('eds'):
        kwargs['predicate_modifiers'] = predicate_modifiers
    kwargs['properties'] = properties

    # this is not a great way to improve robustness when converting
    # many representations, but it'll do until v1.0.0. Also, it only
    # improves robustness on the output, not the input.
    # Note that all the code below is to replace the following:
    #     return dumps(xs, **kwargs)

    head, joiner, tail = _get_output_details(target_fmt)
    parts = []
    if pretty_print:
        joiner = joiner.strip() + '\n'

    def _trim(s):
        if head and s.startswith(head):
            s = s[len(head):].lstrip('\n')
        if tail and s.endswith(tail):
            s = s[:-len(tail)].rstrip('\n')
        return s

    for x in xs:
        try:
            s = dumps([x], **kwargs)
        except (PyDelphinException, KeyError, IndexError):
            logging.exception('could not convert representation')
        else:
            s = _trim(s)
            parts.append(s)

    # set these after so head and tail are used correctly in _trim
    if pretty_print:
        if head:
            head += '\n'
        if tail:
            tail = '\n' + tail

    return head + joiner.join(parts) + tail
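
# Illustrative usage of convert(); 'sentence.mrs' names a hypothetical file
# containing a single SimpleMRS representation:
#
#     >>> from delphin.commands import convert
#     >>> print(convert('sentence.mrs', 'simplemrs', 'dmrs-json', indent=2))
#
# When *path* is a testsuite directory, the *select* query (by default
# 'result:mrs') determines which stored representations are converted.
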
def _get_codec(codec, load=True):
    if codec == 'simplemrs':
        from delphin.mrs import simplemrs
        return simplemrs.loads if load else simplemrs.dumps
    elif codec == 'ace' and load:
        return _read_ace_parse
    elif codec == 'mrx':
        from delphin.mrs import mrx
        return mrx.loads if load else mrx.dumps
    elif codec == 'mrs-prolog' and not load:
        from delphin.mrs import prolog
        return prolog.dumps
    elif codec == 'dmrx':
        from delphin.mrs import dmrx
        return dmrx.loads if load else dmrx.dumps
    elif codec == 'simpledmrs' and not load:
        from delphin.mrs import simpledmrs
        return simpledmrs.dumps
    elif codec == 'dmrs-tikz' and not load:
        from delphin.extra import latex
        return latex.dmrs_tikz_dependency
    elif codec in ('mrs-json', 'dmrs-json', 'eds-json'):
        cls = {'mrs-json': _MRS_JSON,
               'dmrs-json': _DMRS_JSON,
               'eds-json': _EDS_JSON}[codec]
        return cls().loads if load else cls().dumps
    elif codec in ('dmrs-penman', 'eds-penman'):
        if codec == 'dmrs-penman':
            model = xmrs.Dmrs
        elif codec == 'eds-penman':
            from delphin.mrs.eds import Eds as model
        func = _penman_loads if load else _penman_dumps
        return partial(func, model=model)
    elif codec == 'eds':
        from delphin.mrs import eds
        return eds.loads if load else eds.dumps
    elif load:
        raise ValueError('invalid source format: ' + codec)
    else:
        raise ValueError('invalid target format: ' + codec)


def _get_output_details(codec):
    if codec == 'mrx':
        return ('<mrs-list', '', '</mrs-list>')
    elif codec == 'dmrx':
        from delphin.mrs import dmrx
        return ('<dmrs-list>', '', '</dmrs-list>')
    elif codec in ('mrs-json', 'dmrs-json', 'eds-json'):
        return ('[', ',', ']')
    else:
        return ('', ' ', '')


# simulate json codecs for MRS and DMRS

class _MRS_JSON(object):
    CLS = xmrs.Mrs

    def getlist(self, o):
        if isinstance(o, dict):
            return [o]
        else:
            return o

    def load(self, f):
        return [self.CLS.from_dict(d) for d in self.getlist(json.load(f))]

    def loads(self, s):
        return [self.CLS.from_dict(d) for d in self.getlist(json.loads(s))]

    def dumps(self, xs, properties=True, pretty_print=False, indent=None,
              **kwargs):
        if pretty_print and indent is None:
            indent = 2
        return json.dumps(
            [
                self.CLS.to_dict(
                    (x if isinstance(x, self.CLS)
                     else self.CLS.from_xmrs(x, **kwargs)),
                    properties=properties)
                for x in xs
            ],
            indent=indent)


class _DMRS_JSON(_MRS_JSON):
    CLS = xmrs.Dmrs


class _EDS_JSON(_MRS_JSON):
    from delphin.mrs import eds
    CLS = eds.Eds


# load Penman module on demand

def _penman_loads(s, model=None, **kwargs):
    from delphin.mrs import penman
    return penman.loads(s, model=model, **kwargs)


def _penman_dumps(xs, model=None, **kwargs):
    from delphin.mrs import penman
    strings = []
    for x in xs:
        try:
            strings.append(penman.dumps([x], model=model, **kwargs))
        except penman.penman.EncodeError:
            logging.error('Invalid graph; possibly disconnected')
            strings.append('')
    return '\n'.join(strings)


# read simplemrs from ACE output

def _read_ace_parse(s):
    from delphin.mrs import simplemrs
    if hasattr(s, 'decode'):
        s = s.decode('utf-8')
    surface = None
    newline = False
    for line in s.splitlines():
        if line.startswith('SENT: '):
            surface = line[6:]
        # regular ACE output
        elif line.startswith('['):
            m = line.partition(' ; ')[0].strip()
            m = simplemrs.loads(m, single=True)
            m.surface = surface
            yield m
        # with --tsdb-stdout
        elif line.startswith('('):
            while line:
                expr = SExpr.parse(line)
                line = expr.remainder.lstrip()
                if len(expr.data) == 2 and expr.data[0] == ':results':
                    for result in expr.data[1]:
                        for key, val in result:
                            if key == ':mrs':
                                yield simplemrs.loads(val, single=True)
        elif line == '\n':
            if newline:
                surface = None
                newline = False
            else:
                newline = True
        else:
            pass
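
# For instance (illustrative only), the reader and writer used by convert()
# can be looked up independently through the codec dispatcher above:
#
#     >>> loads = _get_codec('simplemrs')              # simplemrs.loads
#     >>> dumps = _get_codec('dmrs-json', load=False)  # _DMRS_JSON().dumps
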
###############################################################################
### SELECT ####################################################################
def select(dataspec, testsuite, mode='list', cast=True):
    """
    Select data from [incr tsdb()] profiles.

    Args:
        dataspec (str): TSQL select query (e.g., `'i-id i-input mrs'`
            or `'* from item where readings > 0'`)
        testsuite (str, TestSuite): testsuite or path to testsuite
            containing data to select
        mode (str): see :func:`delphin.itsdb.select_rows` for a
            description of the *mode* parameter (default: `list`)
        cast (bool): if `True`, cast column values to their datatype
            according to the relations file (default: `True`)
    Returns:
        a generator that yields selected data
    """
    if isinstance(testsuite, itsdb.ItsdbProfile):
        testsuite = itsdb.TestSuite(testsuite.root)
    elif not isinstance(testsuite, itsdb.TestSuite):
        testsuite = itsdb.TestSuite(testsuite)
    return tsql.select(dataspec, testsuite, mode=mode, cast=cast)
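
# Illustrative usage (the testsuite path is hypothetical); the query syntax
# is the same TSQL accepted by the `delphin select` subcommand:
#
#     >>> from delphin.commands import select
#     >>> for row in select('i-input where readings > 0', 'tsdb/mrs/'):
#     ...     print(row)
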
###############################################################################
### MKPROF ####################################################################
def mkprof(destination, source=None, relations=None, where=None,
           in_place=False, skeleton=False, full=False, gzip=False):
    """
    Create [incr tsdb()] profiles or skeletons.

    Data for the testsuite may come from an existing testsuite or from
    a list of sentences. There are four main usage patterns:

        - `source="testsuite/"` -- read data from `testsuite/`
        - `source=None, in_place=True` -- read data from *destination*
        - `source=None, in_place=False` -- read sentences from stdin
        - `source="sents.txt"` -- read sentences from `sents.txt`

    For the latter two, the *relations* parameter must be specified.

    Args:
        destination (str): path of the new testsuite
        source (str): path to a source testsuite or a file containing
            sentences; if not given and *in_place* is `False`,
            sentences are read from stdin
        relations (str): path to a relations file to use for the
            created testsuite; if `None` and *source* is given, the
            relations file of the source testsuite is used
        where (str): TSQL condition to filter records by; ignored if
            *source* is not a testsuite
        in_place (bool): if `True` and *source* is not given, use
            *destination* as the source for data (default: `False`)
        skeleton (bool): if `True`, only write tsdb-core files
            (default: `False`)
        full (bool): if `True`, copy all data from the source
            testsuite (requires *source* to be a testsuite path;
            default: `False`)
        gzip (bool): if `True`, non-empty tables will be compressed
            with gzip
    """
    # basic validation
    if skeleton and full:
        raise ValueError("'skeleton' is incompatible with 'full'")
    elif skeleton and in_place:
        raise ValueError("'skeleton' is incompatible with 'in_place'")
    elif in_place and source is not None:
        raise ValueError("'in_place' is incompatible with 'source'")
    if in_place:
        source = destination
    if full and (source is None or not os.path.isdir(source)):
        raise ValueError("'full' must be used with a source testsuite")
    if relations is None and source is not None and os.path.isdir(source):
        relations = os.path.join(source, 'relations')
    elif relations is None or not os.path.isfile(relations):
        raise ValueError('invalid or missing relations file: {}'
                         .format(relations))

    # setup destination testsuite
    _prepare_output_directory(destination)
    dts = itsdb.TestSuite(path=destination, relations=relations)

    # input is sentences on stdin
    if source is None:
        dts.write({'item': _lines_to_rows(sys.stdin, dts.relations)},
                  gzip=gzip)
    # input is sentence file
    elif os.path.isfile(source):
        with open(source) as fh:
            dts.write({'item': _lines_to_rows(fh, dts.relations)},
                      gzip=gzip)
    # input is source testsuite
    elif os.path.isdir(source):
        sts = itsdb.TestSuite(source)
        tables = dts.relations.tables if full else itsdb.tsdb_core_files
        where = '' if where is None else 'where ' + where
        for table in tables:
            if sts.size(table) > 0:
                # filter the data, but use all if the query fails
                # (e.g., if the filter and table cannot be joined)
                try:
                    rows = tsql.select(
                        '* from {} {}'.format(table, where), sts, cast=False)
                except itsdb.ItsdbError:
                    rows = sts[table]
                dts.write({table: rows}, gzip=gzip)
    dts.reload()

    # unless a skeleton was requested, make empty files for other tables
    if not skeleton:
        for table in dts.relations:
            if len(dts[table]) == 0:
                dts.write({table: []})

    # summarize what was done
    if sys.stdout.isatty():
        _red = lambda s: '\x1b[1;31m{}\x1b[0m'.format(s)
    else:
        _red = lambda s: s
    fmt = '{:>8} bytes\t{}'
    for filename in ['relations'] + list(dts.relations.tables):
        path = os.path.join(destination, filename)
        if os.path.isfile(path):
            stat = os.stat(path)
            print(fmt.format(stat.st_size, filename))
        elif os.path.isfile(path + '.gz'):
            stat = os.stat(path + '.gz')
            print(fmt.format(stat.st_size, _red(filename + '.gz')))
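
# Illustrative usage (hypothetical paths): build a skeleton from a plain
# sentence file, where a line beginning with '*' marks its item as
# ungrammatical (i-wf is set to 0 by _lines_to_rows() below):
#
#     >>> from delphin.commands import mkprof
#     >>> mkprof('new-skeleton/', source='sents.txt',
#     ...        relations='relations-file', skeleton=True)
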
def _lines_to_rows(lines, relations):
    # field indices only need to be computed once, so don't use
    # itsdb.Record.from_dict()
    i_id_idx = relations['item'].index('i-id')
    i_wf_idx = relations['item'].index('i-wf')
    i_input_idx = relations['item'].index('i-input')
    num_fields = len(relations['item'])

    def make_row(i_id, i_wf, i_input):
        row = [None] * num_fields
        row[i_id_idx] = i_id
        row[i_wf_idx] = i_wf
        row[i_input_idx] = i_input
        return itsdb.Record(relations['item'], row)

    for i, line in enumerate(lines):
        i_wf, i_input = (0, line[1:]) if line.startswith('*') else (1, line)
        yield make_row(i * 10, i_wf, i_input.strip())


###############################################################################
### PROCESS ###################################################################
def process(grammar, testsuite, source=None, select=None,
            generate=False, transfer=False, options=None,
            all_items=False, result_id=None, gzip=False):
    """
    Process (e.g., parse) a [incr tsdb()] profile.

    Results are written directly to *testsuite*. If *select* is
    `None`, the defaults depend on the task:

        ==========  =========================
        Task        Default value of *select*
        ==========  =========================
        Parsing     `item:i-input`
        Transfer    `result:mrs`
        Generation  `result:mrs`
        ==========  =========================

    Args:
        grammar (str): path to a compiled grammar image
        testsuite (str): path to a [incr tsdb()] testsuite where data
            will be read from (see *source*) and written to
        source (str): path to a [incr tsdb()] testsuite; if `None`,
            *testsuite* is used as the source of data
        select (str): TSQL query for selecting processor inputs
            (default depends on the processor type)
        generate (bool): if `True`, generate instead of parse
            (default: `False`)
        transfer (bool): if `True`, transfer instead of parse
            (default: `False`)
        options (list): list of ACE command-line options to use when
            invoking the ACE subprocess; unsupported options will
            give an error message
        all_items (bool): if `True`, don't exclude ignored items
            (those with `i-wf==2`) when parsing
        result_id (int): if given, only keep items with the specified
            `result-id`
        gzip (bool): if `True`, non-empty tables will be compressed
            with gzip
    """
    from delphin.interfaces import ace

    if generate and transfer:
        raise ValueError("'generate' is incompatible with 'transfer'")
    if source is None:
        source = testsuite
    if select is None:
        select = 'result:mrs' if (generate or transfer) else 'item:i-input'
    if generate:
        processor = ace.AceGenerator
    elif transfer:
        processor = ace.AceTransferer
    else:
        if not all_items:
            select += ' where i-wf != 2'
        processor = ace.AceParser
    if result_id is not None:
        select += ' where result-id == {}'.format(result_id)

    source = itsdb.TestSuite(source)
    target = itsdb.TestSuite(testsuite)
    column, tablename, condition = _interpret_selection(select, source)
    table = itsdb.Table(
        source[tablename].fields,
        tsql.select(
            '* from {} {}'.format(tablename, condition), source, cast=False))

    with processor(grammar, cmdargs=options) as cpu:
        target.process(cpu, ':' + column, source=table, gzip=gzip)
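
# Illustrative usage (hypothetical grammar image and profile paths): parse a
# profile in place, then generate from its stored MRSs into another existing
# profile:
#
#     >>> from delphin.commands import process
#     >>> process('grammar.dat', 'parsed-profile/')
#     >>> process('grammar.dat', 'generated-profile/',
#     ...         source='parsed-profile/', generate=True)
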
def _interpret_selection(select, source):
    queryobj = tsql.inspect_query('select ' + select)
    projection = queryobj['projection']
    if projection == '*' or len(projection) != 1:
        raise ValueError("'select' must return a single column")
    tablename, _, column = projection[0].rpartition(':')
    if not tablename:
        # query could be 'i-input from item' instead of 'item:i-input'
        if len(queryobj['tables']) == 1:
            tablename = queryobj['tables'][0]
        # otherwise guess
        else:
            tablename = source.relations.find(column)[0]
    try:
        condition = select[select.index(' where ') + 1:]
    except ValueError:
        condition = ''
    return column, tablename, condition


###############################################################################
### REPP ######################################################################
def repp(file, config=None, module=None, active=None,
         format=None, trace_level=0):
    """
    Tokenize with a Regular Expression PreProcessor (REPP).

    Results are printed directly to stdout. If more programmatic
    access is desired, the :mod:`delphin.repp` module provides a
    similar interface.

    Args:
        file (str, file): filename, open file, or stream of sentence
            inputs
        config (str): path to a PET REPP configuration (.set) file
        module (str): path to a top-level REPP module; other modules
            are found by external group calls
        active (list): select which modules are active; if `None`, all
            are used; incompatible with *config* (default: `None`)
        format (str): the output format (`"yy"`, `"string"`, `"line"`,
            or `"triple"`; default: `"yy"`)
        trace_level (int): if `0`, no trace info is printed; if `1`,
            applied rules are printed; if greater than `1`, both
            applied and unapplied rules (in order) are printed
            (default: `0`)
    """
    from delphin.repp import REPP

    if config is not None and module is not None:
        raise ValueError("cannot specify both 'config' and 'module'")
    if config is not None and active:
        raise ValueError("'active' cannot be used with 'config'")
    if config:
        r = REPP.from_config(config)
    elif module:
        r = REPP.from_file(module, active=active)
    else:
        r = REPP()  # just tokenize

    if hasattr(file, 'read'):
        for line in file:
            _repp(r, line, format, trace_level)
    else:
        with io.open(file, encoding='utf-8') as fh:
            for line in fh:
                _repp(r, line, format, trace_level)
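
# Illustrative usage (hypothetical paths): tokenize sentences from a file
# with a top-level REPP module and print one token form per line:
#
#     >>> from delphin.commands import repp
#     >>> repp('sentences.txt', module='tokenizer.rpp', format='line')
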
def _repp(r, line, format, trace_level):
    if trace_level > 0:
        for step in r.trace(line.rstrip('\n'), verbose=True):
            if not hasattr(step, 'applied'):
                print('Done:{}'.format(step.string))
                continue
            if step.applied == True or trace_level > 1:
                print('{}:{!s}\n   In:{}\n  Out:{}'.format(
                    'Applied' if step.applied else 'Did not apply',
                    step.operation, step.input, step.output))
    res = r.tokenize(line.rstrip('\n'))
    if format == 'yy':
        print(res)
    elif format == 'string':
        print(' '.join(t.form for t in res.tokens))
    elif format == 'line':
        for t in res.tokens:
            print(t.form)
        print()
    elif format == 'triple':
        for t in res.tokens:
            if t.lnk.type == Lnk.CHARSPAN:
                cfrom, cto = t.lnk.data
            else:
                cfrom, cto = -1, -1
            print('({}, {}, {})'.format(cfrom, cto, t.form))
        print()


###############################################################################
### COMPARE ###################################################################
def compare(testsuite, gold, select='i-id i-input mrs'):
    """
    Compare two [incr tsdb()] profiles.

    Args:
        testsuite (str, TestSuite): path to the test [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        gold (str, TestSuite): path to the gold [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        select: TSQL query to select (id, input, mrs) triples
            (default: `i-id i-input mrs`)
    Yields:
        dict: Comparison results as::

            {"id": "item identifier",
             "input": "input sentence",
             "test": number_of_unique_results_in_test,
             "shared": number_of_shared_results,
             "gold": number_of_unique_results_in_gold}
    """
    from delphin.mrs import simplemrs, compare as mrs_compare

    if not isinstance(testsuite, itsdb.TestSuite):
        if isinstance(testsuite, itsdb.ItsdbProfile):
            testsuite = testsuite.root
        testsuite = itsdb.TestSuite(testsuite)
    if not isinstance(gold, itsdb.TestSuite):
        if isinstance(gold, itsdb.ItsdbProfile):
            gold = gold.root
        gold = itsdb.TestSuite(gold)

    queryobj = tsql.inspect_query('select ' + select)
    if len(queryobj['projection']) != 3:
        raise ValueError('select does not return 3 fields: ' + select)

    input_select = '{} {}'.format(queryobj['projection'][0],
                                  queryobj['projection'][1])
    i_inputs = dict(tsql.select(input_select, testsuite))

    matched_rows = itsdb.match_rows(
        tsql.select(select, testsuite),
        tsql.select(select, gold),
        0)

    for (key, testrows, goldrows) in matched_rows:
        (test_unique, shared, gold_unique) = mrs_compare.compare_bags(
            [simplemrs.loads_one(row[2]) for row in testrows],
            [simplemrs.loads_one(row[2]) for row in goldrows])
        yield {'id': key,
               'input': i_inputs[key],
               'test': test_unique,
               'shared': shared,
               'gold': gold_unique}
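
# Illustrative usage (hypothetical profile paths): summarize unique and
# shared MRS readings per item:
#
#     >>> from delphin.commands import compare
#     >>> for result in compare('parsed-profile/', 'gold-profile/'):
#     ...     print('{id}\t<{test},{shared},{gold}>'.format(**result))
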
###############################################################################
### HELPER FUNCTIONS ##########################################################

def _prepare_output_directory(path):
    try:
        os.makedirs(path)  # exist_ok=True is available from Python 3.2
    except OSError as ex:  # PermissionError is available from Python 3.3
        if ex.errno == 17 and os.path.isdir(path):
            pass  # existing directory; maybe it's usable
        else:
            raise