# Source code for delphin.derivation

# coding: utf-8

"""
Classes and functions related to derivation trees.
"""

import re
from collections import namedtuple

try:
    from collections.abc import Sequence
except ImportError:  # Python 2 has no collections.abc
    from collections import Sequence

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.exceptions import PyDelphinSyntaxError


class DerivationSyntaxError(PyDelphinSyntaxError):
    """Raised when parsing an invalid UDF or UDX string."""


# Field names of the namedtuple-based classes defined in this module
_terminal_fields = ('form', 'tokens')
_token_fields = ('id', 'tfs')
_nonterminal_fields = ('id', 'entity', 'score', 'start', 'end', 'daughters')
# UDX extensions; kept as plain attributes rather than namedtuple fields
_udx_fields = ('head', 'type')
# Every field name accepted when filtering dictionary output; the order of
# this tuple is arbitrary because it is built from a set
_all_fields = tuple(
    set(_terminal_fields) | set(_nonterminal_fields) | set(_udx_fields)
)


def from_string(s):
    """
    Instantiate a Derivation from a UDF or UDX string representation.

    The UDF/UDX representations are as output by a processor like the
    `LKB <http://moin.delph-in.net/LkbTop>`_ or
    `ACE <http://sweaglesw.org/linguistics/ace/>`_, or from the
    :meth:`UDFNode.to_udf` or :meth:`UDFNode.to_udx` methods.

    Args:
        s (str): UDF or UDX serialization
    Returns:
        Derivation: the top node of the parsed derivation
    Raises:
        DerivationSyntaxError: if *s* cannot be parsed
        TypeError: if the top node is a root with *score*, *start*, or
            *end* set
        ValueError: if the top node is a root that does not have
            exactly one non-terminal daughter
    """
    udfnode = _from_string(s)
    # head and type are not namedtuple fields, so they are not covered by
    # the unpacking and must be passed along explicitly
    return Derivation(*udfnode, head=udfnode._head, type=udfnode.type)


def from_dict(d):
    """
    Instantiate a Derivation from a dictionary representation.

    The dictionary representation may come from the HTTP interface
    (see the `ErgApi <http://moin.delph-in.net/ErgApi>`_ wiki) or
    from the :meth:`UDFNode.to_dict` method. Note that in the
    former case, the JSON response should have already been decoded
    into a Python dictionary.

    Args:
        d (dict): dictionary representation of a derivation
    Returns:
        Derivation: the top node of the decoded derivation
    Raises:
        TypeError: if *d* has neither a `daughters` nor a `form` key
    """
    node = _from_dict(d)
    if node is None:
        # _from_dict() builds nothing for a dict that is neither a
        # nonterminal nor a preterminal; report that explicitly (same
        # exception type as the failed unpacking it replaces)
        raise TypeError(
            "dictionary does not describe a derivation node "
            "(expected a 'daughters' or 'form' key)"
        )
    # head and type are not namedtuple fields, so they are not covered by
    # the unpacking and must be passed along explicitly
    return Derivation(*node, head=node._head, type=node.type)


class _UDFNodeBase(object):
    """
    Base class for :class:`UDFNode` and :class:`UDFTerminal`.

    It provides the string conversion, the inequality operator, and
    the serialization methods shared by both node classes.
    """
    def __str__(self):
        # the string form is the UDF serialization without line breaks
        return self.to_udf(indent=None)

    # cannot rely on default __ne__ while namedtuple is a shared base class
    def __ne__(self, other):
        if not isinstance(other, _UDFNodeBase):
            return NotImplemented
        return not (self == other)

    # serialization

    def to_udf(self, indent=1):
        """
        Encode the node and its descendants in the UDF format.

        Args:
            indent (int): the number of spaces to indent at each level;
                if `None`, the result is a single line
        Returns:
            str: the UDF-serialized string
        """
        return _to_udf(self, indent, 1)

    def to_udx(self, indent=1):
        """
        Encode the node and its descendants in the UDF export format.

        Unlike :meth:`to_udf`, head marking (`^`) and type names
        (`@type`) are included on the entities of nodes that have them.

        Args:
            indent (int): the number of spaces to indent at each level;
                if `None`, the result is a single line
        Returns:
            str: the UDX-serialized string
        """
        return _to_udf(self, indent, 1, udx=True)

    def to_dict(self, fields=_all_fields, labels=None):
        """
        Encode the node as a dictionary suitable for JSON serialization.

        Args:
            fields: if given, this is a whitelist of fields to include
                on nodes (`daughters` and `form` are always shown)
            labels: optional label annotations to embed in the
                derivation dict; the value is a list of lists matching
                the structure of the derivation (e.g.,
                `["S", ["NP", ["NNS", ["Dogs"]]], ["VP", ["VBZ", ["bark"]]]]`)
        Returns:
            dict: the dictionary representation of the structure
        Raises:
            ValueError: if *fields* contains an unknown field name or
                *labels* does not match the derivation structure
        """
        return _to_dict(self, fields, labels)


class UDFToken(namedtuple('UDFToken', _token_fields)):
    """
    A token representation in derivations.

    Token data are not formally nodes, but do have an `id`. Most
    :class:`UDFTerminal` nodes will only have one UDFToken, but
    multi-word entities (e.g. "ad hoc") will have more than one.

    Tokens compare (and hash) by their feature structure only; the
    `id` is ignored.

    Args:
        id (int): token identifier
        tfs (str): the feature structure for the token
    """
    def __new__(cls, id, tfs):
        # ids may arrive as strings (e.g. from a parsed UDF string)
        if id is not None:
            id = int(id)
        return super(UDFToken, cls).__new__(cls, id, tfs)

    def __repr__(self):
        return '<UDFToken object ({} {!r}) at {}>'.format(
            self.id, self.tfs, id(self)
        )

    def __eq__(self, other):
        """
        Token data are the same if they have the same feature structure.
        """
        if not isinstance(other, UDFToken):
            return NotImplemented
        return self.tfs == other.tfs

    # tuple provides its own __ne__ (comparing id as well), so it must be
    # overridden explicitly to stay consistent with __eq__
    def __ne__(self, other):
        if not isinstance(other, UDFToken):
            return NotImplemented
        return self.tfs != other.tfs

    # overriding __eq__ removes the inherited hash in Python 3; hash on the
    # same key that equality uses
    def __hash__(self):
        return hash(self.tfs)


class UDFTerminal(_UDFNodeBase, namedtuple('UDFTerminal', _terminal_fields)):
    """
    Terminal nodes in the Unified Derivation Format.

    The *form* field is always set; *tokens* is an empty list when no
    tokens are given.

    See: http://moin.delph-in.net/ItsdbDerivations

    Args:
        form (str): surface form of the terminal
        tokens (list, optional): iterable of tokens
        parent (UDFNode, optional): parent node in derivation
    """

    def __new__(cls, form, tokens=None, parent=None):
        # for convenience make sure tokens is a list if None
        if tokens is None:
            tokens = []
        t = super(UDFTerminal, cls).__new__(cls, form, tokens)
        # internal bookkeeping
        t._parent = parent
        return t

    def __repr__(self):
        return '<UDFTerminal object ({}) at {}>'.format(self.form, id(self))

    def __eq__(self, other):
        """
        Terminal nodes are the same if they have the same form and
        token data.
        """
        if not isinstance(other, UDFTerminal):
            return NotImplemented
        if self.form != other.form:
            return False
        if self.tokens != other.tokens:
            return False
        return True

    def is_root(self):
        """
        Return `False` (as a `UDFTerminal` is never a root).

        This function is provided for convenience, so one does not need
        to check if `isinstance(n, UDFNode)` before testing if the node
        is a root.
        """
        return False


class UDFNode(_UDFNodeBase, namedtuple('UDFNode', _nonterminal_fields)):
    """
    Normal (non-leaf) nodes in the Unified Derivation Format.

    Root nodes are just UDFNodes whose `id`, by convention, is
    `None`. The `daughters` list can be composed of either UDFNodes or
    other objects (generally it should be uniformly one or the other).
    In the latter case, the `UDFNode` is a preterminal, and the
    daughters are terminal nodes.

    Args:
        id (int): unique node identifier
        entity (str): grammar entity represented by the node
        score (float, optional): probability or weight of the node
        start (int, optional): start position of tokens encompassed by
            the node
        end (int, optional): end position of tokens encompassed by the
            node
        daughters (list, optional): iterable of daughter nodes
        head (bool, optional): `True` if the node is a syntactic head
            node
        type (str, optional): grammar type name
        parent (UDFNode, optional): parent node in derivation
    Raises:
        ValueError: if any of *daughters* is a root node
    """

    def __new__(cls, id, entity,
                score=None, start=None, end=None, daughters=None,
                head=None, type=None, parent=None):
        # numeric fields can be underspecified as -1 if not a root
        if id is not None:
            id = int(id)
            score = -1.0 if score is None else float(score)
            start = -1 if start is None else int(start)
            end = -1 if end is None else int(end)
        # for convenience make sure daughters is a list if None
        if daughters is None:
            daughters = []
        # make sure daughters are not roots (is this check unnecessary?)
        if any(dtr.is_root() for dtr in daughters):
            raise ValueError('Daughter nodes cannot be roots.')
        node = super(UDFNode, cls).__new__(
            cls, id, entity, score, start, end, daughters
        )
        # internal bookkeeping; head and type are UDX extensions and are
        # stored as attributes rather than as namedtuple fields
        node._parent = parent
        node._head = head
        node.type = type
        return node

    def __repr__(self):
        return '<UDFNode object ({}, {}, {}, {}, {}) at {}>'.format(
            self.id, self.entity, self.score, self.start, self.end, id(self)
        )

    def __eq__(self, other):
        """
        Two derivations are equal if their entities, tokenization, and
        daughters are the same. IDs and scores are irrelevant.
        """
        if not isinstance(other, UDFNode):
            return NotImplemented
        # Check attributes
        if self.entity.lower() != other.entity.lower():
            return False
        if self.type != other.type:
            return False
        if self.is_head() != other.is_head():
            return False
        if self.start != other.start or self.end != other.end:
            return False
        if len(self.daughters) != len(other.daughters):
            return False
        if any(a != b for a, b in zip(self.daughters, other.daughters)):
            return False
        # Return true if they're the same!
        return True

    def is_root(self):
        """
        Return `True` if the node is a root node.

        Note:
            This is not simply the top node; by convention, a node is
            a root if its `id` is `None`.
        """
        return self.id is None

    # UDX extensions

    def is_head(self):
        """
        Return `True` if the node is a head.

        A node is a head if it is marked as a head in the UDX format or
        it has no siblings. `False` is returned if the node is known
        to not be a head (has a sibling that is a head). Otherwise it
        is indeterminate whether the node is a head, and `None` is
        returned.

        """
        # a node without a parent falls back to the one-item default
        # list and therefore counts as having no siblings
        if (self._head or self.is_root()
                or len(getattr(self._parent, 'daughters', [None])) == 1):
            return True
        # siblings that carry no head marking at all (e.g. terminals)
        # are treated as unmarked rather than raising AttributeError
        elif any(getattr(dtr, '_head', None)
                 for dtr in self._parent.daughters):
            return False
        return None

    # Convenience methods

    def preterminals(self):
        """
        Return the list of preterminals (i.e. lexical grammar-entities).
        """
        nodes = []
        for dtr in self.daughters:
            if isinstance(dtr, UDFTerminal):
                nodes.append(self)
            else:
                nodes.extend(dtr.preterminals())
        return nodes

    def terminals(self):
        """
        Return the list of terminals (i.e. lexical units).
        """
        nodes = []
        for dtr in self.daughters:
            if isinstance(dtr, UDFTerminal):
                nodes.append(dtr)
            else:
                nodes.extend(dtr.terminals())
        return nodes


class Derivation(UDFNode):
    """
    A [incr tsdb()] derivation.

    A Derivation object is simply a :class:`UDFNode` but as it is
    intended to represent an entire derivation tree it performs
    additional checks on instantiation if the top node is a root node,
    namely that the top node only has the *entity* attribute set, and
    that it has only one node on its *daughters* list.

    Raises:
        TypeError: if *id* is `None` and any of *score*, *start*, or
            *end* is not `None`
        ValueError: if *id* is `None` and *daughters* is not a list
            with exactly one :class:`UDFNode`
    """

    def __init__(self, id, entity,
                 score=None, start=None, end=None, daughters=None,
                 head=None, type=None, parent=None):
        # Note: Attribute assignment is done in UDFNode.__new__(), so
        #       this only checks the arguments.
        # If id is None, it is a root, and score, start, and end must
        # all be None, and daughters must be a list with one UDFNode
        if id is None:
            if score is not None or start is not None or end is not None:
                raise TypeError(
                    'Root nodes (with id=None) of Derivation objects '
                    'must have *score*, *start*, and *end* set to None.'
                )
            if (daughters is None or len(daughters) != 1
                    or not isinstance(daughters[0], UDFNode)):
                raise ValueError(
                    'Root nodes (with id=None) of Derivation objects '
                    'must have a single daughter node.'
                )


###############################################################################
# Deserialization

# note that this regex doesn't have the initial open-parenthesis
# (see _from_string())
def _compile_udf_re():
    """
    Build the regular expression used to scan UDF/UDX strings.

    The pattern is an alternation of four kinds of item, tried in this
    order: a regular node header, the end of a branch, a terminal
    node, and a root symbol.
    """
    alternatives = (
        # regular node: id, entity, score, start, end, then an open paren
        r'\s*(?P<id>{token})\s+(?P<entity>{string}|{token})'
        r'\s+(?P<score>{token})\s+(?P<start>{token})'
        r'\s+(?P<end>{token})\s*\(',
        # branch end
        r'\s*(?P<done>\))',
        # terminal node; anything after the form is optional and is either
        # LKB-style start/end (e.g. ("word" 1 2) ) or a list of token TFSs
        # (e.g. ("word" 1 "token [ ... ]" 2 "token [... ]") )
        r'\s*(?P<form>{string})'
        r'('
        r'\s+(?P<lkb_start>\d+)\s+(?P<lkb_end>\d+)'
        r'|(?P<tokens>(?:\s+{token}\s+{string})*)'
        r')?'
        r'\s*\)',
        # root symbol
        r'\s*(?P<root>{token})\s*\(?',
    )
    return re.compile(
        '|'.join(alternatives)
        .format(token=r'[^\s()]+', string=r'"[^"\\]*(?:\\.[^"\\]*)*"')
    )


_udf_re = _compile_udf_re()


def _from_string(s):
    """
    Parse a UDF or UDX string into a tree of nodes.

    The string is scanned with `_udf_re` while a stack of the currently
    open nodes is maintained; a node is attached to its parent when its
    closing parenthesis is read, and parsing stops as soon as the
    outermost node is closed.

    Args:
        s (str): UDF or UDX serialization
    Returns:
        UDFNode: the top node of the parsed derivation
    Raises:
        DerivationSyntaxError: if *s* is not enclosed in parentheses,
            if a closing parenthesis or a terminal appears while no
            node is open, or if no complete derivation could be read
    """
    if not (s.startswith('(') and s.endswith(')')):
        raise DerivationSyntaxError(
            'missing opening or closing parentheses', text=s)
    s_ = s[1:]  # get rid of initial open-parenthesis
    stack = []
    deriv = None
    for match in _udf_re.finditer(s_):
        if match.group('done'):
            # a closing parenthesis with nothing open is a syntax error,
            # not an IndexError from popping an empty stack
            if not stack:
                raise DerivationSyntaxError(
                    'possibly unbalanced parentheses', text=s)
            node = stack.pop()
            if not stack:
                deriv = node
                break
            stack[-1].daughters.append(node)
        elif match.group('form'):
            # a terminal needs an open node to attach to
            if not stack:
                raise DerivationSyntaxError(
                    'terminal node without a parent node', text=s)
            gd = match.groupdict()
            # ignore LKB-style start/end data if it exists on gd
            term = UDFTerminal(
                _unquote(gd['form']),
                tokens=_udf_tokens(gd.get('tokens')),
                parent=stack[-1]
            )
            stack[-1].daughters.append(term)
        elif match.group('id'):
            gd = match.groupdict()
            head = None
            # UDX entities look like [^]entity[@type]
            entity, _, type = gd['entity'].partition('@')
            # startswith() is safe even when the entity part is empty
            if entity.startswith('^'):
                entity = entity[1:]
                head = True
            if type == '':
                type = None
            udf = UDFNode(gd['id'], entity, gd['score'],
                          gd['start'], gd['end'],
                          head=head, type=type,
                          parent=stack[-1] if stack else None)
            stack.append(udf)
        elif match.group('root'):
            udf = UDFNode(None, match.group('root'))
            stack.append(udf)
    if deriv is None:
        # nodes still open at the end of input were never closed
        if stack:
            raise DerivationSyntaxError(
                'possibly unbalanced parentheses', text=s)
        raise DerivationSyntaxError(text=s)
    return deriv


def _unquote(s):
    """
    Strip one pair of enclosing double-quotes from *s*.

    The text between the quotes is returned as-is (escapes are not
    interpreted). Strings that are not enclosed in double-quotes are
    returned unchanged, and `None` is passed through.
    """
    if s is None:
        return None
    quoted = re.compile(r'^"(.*)"$')
    return quoted.sub(r'\1', s)


def _udf_tokens(tokenstring):
    """
    Extract the tokens from the token portion of a terminal node.

    Args:
        tokenstring (str): zero or more `id "tfs"` pairs; may be empty
            or `None`
    Returns:
        list: one :class:`UDFToken` per pair, with the feature
        structure unquoted
    """
    if not tokenstring:
        return []
    pair_pattern = r'\s*({id})\s+({tfs})'.format(
        id=r'\d+', tfs=r'"[^"\\]*(?:\\.[^"\\]*)*"'
    )
    return [
        UDFToken(token_id, _unquote(quoted_tfs))
        for token_id, quoted_tfs in re.findall(pair_pattern, tokenstring)
    ]


def _from_dict(d, parent=None):
    """
    Recursively build a node from its dictionary representation.

    A dictionary with a `daughters` key becomes a nonterminal node;
    otherwise one with a `form` key becomes a preterminal node with a
    single terminal daughter. Anything else yields `None`.

    Args:
        d (dict): dictionary representation of a node
        parent (UDFNode, optional): the node that will contain the result
    Returns:
        UDFNode: the decoded node, or `None` if *d* has neither a
        `daughters` nor a `form` key
    """
    is_nonterminal = 'daughters' in d
    if not is_nonterminal and 'form' not in d:
        return None

    # both kinds of node share the same attributes
    node = UDFNode(
        d.get('id'),
        d['entity'],
        score=d.get('score'),
        start=d.get('start'),
        end=d.get('end'),
        head=d.get('head'),
        type=d.get('type'),
        parent=parent
    )
    if is_nonterminal:
        node.daughters.extend(
            _from_dict(sub, parent=node) for sub in d['daughters']
        )
    else:
        # the terminal's data is stored directly on the preterminal's dict
        terminal = UDFTerminal(
            form=d['form'],
            tokens=[UDFToken(tok['id'], tok['tfs'])
                    for tok in d.get('tokens', [])],
            parent=node
        )
        node.daughters.append(terminal)
    return node


###############################################################################
# Serialization

def _to_udf(obj, indent, level, udx=False):
    """
    Serialize *obj* and its descendants to the UDF (or UDX) format.

    Args:
        obj: the :class:`UDFNode` or :class:`UDFTerminal` to serialize
        indent (int): spaces of indentation per level; if `None`,
            items are separated by a single space instead of newlines
        level (int): the nesting depth used for the items inside *obj*
        udx (bool): if `True`, include head marks (`^`) and types
            (`@type`) on entities
    Returns:
        str: the serialized string
    Raises:
        TypeError: if *obj* is neither a UDFNode nor a UDFTerminal
    """
    if indent is None:
        delim = ' '
    else:
        delim = '\n' + ' ' * indent * level

    if isinstance(obj, UDFNode):
        label = obj.entity
        if udx:
            if obj._head:
                label = '^' + label
            if obj.type:
                label = '{}@{}'.format(label, obj.type)
        # every daughter is preceded by the delimiter (this forces the
        # indent of the first one, too)
        tail = ''.join(
            delim + _to_udf(child, indent, level + 1, udx)
            for child in obj.daughters
        )
        if obj.id is None:
            return '({}{})'.format(label, tail)
        # :g for score makes -1.0 look like -1
        return '({} {} {:g} {} {}{})'.format(
            obj.id, label, obj.score, obj.start, obj.end, tail
        )

    if isinstance(obj, UDFTerminal):
        parts = ['"{}"'.format(obj.form)]
        parts.extend('{} "{}"'.format(tok.id, tok.tfs) for tok in obj.tokens)
        return '({})'.format(delim.join(parts))

    raise TypeError('Invalid node: {}'.format(str(obj)))


def _to_dict(obj, fields, labels):
    """
    Validate the arguments and encode *obj* as a dictionary.

    Args:
        obj: the node to encode
        fields: iterable of field names to include
        labels: `None`, a nested sequence of labels matching the
            structure of *obj*, or any other object, which is used
            as-is as the id-to-label mapping
    Returns:
        dict: the dictionary representation of *obj*
    Raises:
        ValueError: if *labels* does not match the structure of *obj*
            or *fields* contains an unknown field name
    """
    requested = set(fields)
    unknown = requested.difference(_all_fields)
    # normalize labels to a mapping from node id to label
    if labels is None:
        labels = {}
    elif isinstance(labels, Sequence):
        labels = _map_labels(obj, labels)
    if unknown:
        raise ValueError(
            'Invalid field(s): {}'.format(', '.join(unknown))
        )
    return _to_dict_recursive(obj, requested, labels)


def _map_labels(drv, labels):
    """
    Flatten nested label lists into a mapping from node id to label.

    The first item of *labels* is the label of *drv* itself and each
    remaining item is the label list of the corresponding daughter
    (or token, when *drv* has no daughters). Sub-items without an
    `id` attribute get no entry in the mapping.

    Args:
        drv: the object the labels describe
        labels: nested sequence of labels
    Returns:
        dict: mapping of ids to labels; empty if *labels* is empty
    Raises:
        ValueError: if sub-labels are given but their number differs
            from the number of sub-items of *drv*
    """
    mapping = {}
    if not labels:
        return mapping
    own_label = labels[0]
    if own_label:
        mapping[drv.id] = own_label
    children = getattr(drv, 'daughters', getattr(drv, 'tokens', []))
    child_labels = labels[1:]
    if child_labels and len(children) != len(child_labels):
        raise ValueError('Labels do not match derivation structure.')
    for child, sub_labels in zip(children, child_labels):
        if hasattr(child, 'id'):
            mapping.update(_map_labels(child, sub_labels))
    return mapping


def _to_dict_recursive(obj, fields, labels):
    """
    Encode *obj* and its descendants as nested dictionaries.

    The data of a single terminal daughter is merged into the
    dictionary of its preterminal rather than nested under
    `daughters`. Objects that are neither a UDFNode nor a UDFTerminal
    yield an empty dictionary.

    Args:
        obj: the node to encode
        fields (set): names of the fields to include
        labels (dict): mapping of node ids to labels
    Returns:
        dict: the dictionary representation of *obj*
    """
    result = {}
    if isinstance(obj, UDFNode):
        if 'entity' in fields:
            result['entity'] = obj.entity
        # root nodes (id of None) carry no other attributes
        if obj.id is not None:
            for name in ('id', 'score', 'start', 'end'):
                if name in fields:
                    result[name] = getattr(obj, name)
            if 'type' in fields and obj.type:
                result['type'] = obj.type
            if 'head' in fields and obj._head:
                result['head'] = obj._head
        children = obj.daughters
        if children:
            # terminals should always be single daughters
            if len(children) == 1 and isinstance(children[0], UDFTerminal):
                # merge terminal daughter info into current node
                result.update(
                    _to_dict_recursive(children[0], fields, labels)
                )
            else:
                result['daughters'] = [
                    _to_dict_recursive(child, fields, labels)
                    for child in children
                ]
        if obj.id in labels:
            result['label'] = labels[obj.id]
    elif isinstance(obj, UDFTerminal):
        result['form'] = obj.form
        # NOTE: character spans (+FROM/+TO from the token feature
        # structures) are not included in the output
        if obj.tokens and 'tokens' in fields:
            result['tokens'] = [
                {'id': tok.id, 'tfs': tok.tfs} for tok in obj.tokens
            ]
    return result