# Source code for delphin.lnk


"""
Surface alignment for semantic entities.
"""

from typing import Iterable, Optional, Tuple, Union, overload

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.exceptions import PyDelphinException



[docs]
class LnkError(PyDelphinException):
    """
    Raised on invalid Lnk values or operations.

    Within this module, the :class:`Lnk` constructor raises it when
    its arguments are neither a ``<...>``-delimited string
    representation nor one of the Lnk type constants.
    """




[docs]
class Lnk:
    """
    Surface-alignment information for predications.

    Lnk objects link predicates to the surface form in one of several
    ways, the most common of which being the character span of the
    original string.

    Valid types and their associated *data* are shown in the table
    below.

    =============  ===================  =========
    type           data                 example
    =============  ===================  =========
    Lnk.CHARSPAN   surface string span  (0, 5)
    Lnk.CHARTSPAN  chart vertex span    (0, 5)
    Lnk.TOKENS     token identifiers    (0, 1, 2)
    Lnk.EDGE       edge identifier      1
    =============  ===================  =========

    A false *arg* (``None``, ``''``, or ``0``) yields an unspecified
    Lnk whose *data* is ``None``.

    Args:
        arg: Lnk type or the string representation of a Lnk
        data: alignment data (assumes *arg* is a Lnk type)

    Attributes:
        type: the way the Lnk relates the semantics to the surface form
        data: the alignment data (depends on the Lnk type)

    Raises:
        LnkError: if *arg* is neither a ``<...>``-delimited string
            (given without *data*) nor a Lnk type given with *data*
        ValueError: if a ``<...>`` string does not contain the
            integers its format requires (e.g., ``'<a:b>'``)

    Example:

        >>> Lnk('<0:5>').data
        (0, 5)
        >>> str(Lnk.charspan(0,5))
        '<0:5>'
        >>> str(Lnk.chartspan(0,5))
        '<0#5>'
        >>> str(Lnk.tokens([0,1,2]))
        '<0 1 2>'
        >>> str(Lnk.edge(1))
        '<@1>'
    """

    __slots__ = ('type', 'data')

    type: int
    data: Union[int, Tuple[int, ...]]

    # These types determine how a lnk on an EP or MRS are to be
    # interpreted, and thus determine the data type/structure of the
    # lnk data.
    UNSPECIFIED = 0
    CHARSPAN = 1  # Character span; a pair of offsets
    CHARTSPAN = 2  # Chart vertex span: a pair of indices
    TOKENS = 3  # Token numbers: a list of indices
    EDGE = 4  # An edge identifier: a number

    @overload
    def __init__(self, arg: None, data: None = None):
        ...

    @overload
    def __init__(self, arg: str, data: None = None):
        ...

    @overload
    def __init__(self,
                 arg: int,
                 data: Union[None, int, Tuple[int, ...]] = None):
        ...

    def __init__(self, arg, data=None):
        if not arg:
            self.type = Lnk.UNSPECIFIED
            self.data = None
        elif (data is None
              # Only strings can be sliced/parsed; without this check a
              # bare Lnk type (an int) crashed with a TypeError instead
              # of reaching the LnkError below.
              and isinstance(arg, str)
              and (arg[:1], arg[-1:]) == ('<', '>')):
            body = arg[1:-1]
            if body.startswith('@'):
                self.type = Lnk.EDGE
                self.data = int(body[1:])
            elif ':' in body:
                cfrom, cto = body.split(':')
                self.type = Lnk.CHARSPAN
                self.data = (int(cfrom), int(cto))
            elif '#' in body:
                vfrom, vto = body.split('#')
                self.type = Lnk.CHARTSPAN
                self.data = (int(vfrom), int(vto))
            else:
                # anything else is read as whitespace-separated tokens
                self.type = Lnk.TOKENS
                self.data = tuple(map(int, body.split()))
        elif (data is not None
              and arg in (Lnk.CHARSPAN, Lnk.CHARTSPAN,
                          Lnk.TOKENS, Lnk.EDGE)):
            # a specified type is meaningless without its data
            self.type = arg
            self.data = data
        else:
            raise LnkError('invalid Lnk: {!r}'.format((arg, data)))

    @classmethod
    def default(cls) -> 'Lnk':
        """
        Create a Lnk object for when no information is given.
        """
        return cls(None)

    @classmethod
    def charspan(cls,
                 start: Union[str, int],
                 end: Union[str, int]) -> 'Lnk':
        """
        Create a Lnk object for a character span.

        Args:
            start: the initial character position (cfrom)
            end: the final character position (cto)
        """
        return cls(Lnk.CHARSPAN, (int(start), int(end)))

    @classmethod
    def chartspan(cls,
                  start: Union[str, int],
                  end: Union[str, int]) -> 'Lnk':
        """
        Create a Lnk object for a chart span.

        Args:
            start: the initial chart vertex
            end: the final chart vertex
        """
        return cls(Lnk.CHARTSPAN, (int(start), int(end)))

    @classmethod
    def tokens(cls, tokens: Iterable[Union[str, int]]) -> 'Lnk':
        """
        Create a Lnk object for a token range.

        Args:
            tokens: a list of token identifiers
        """
        return cls(Lnk.TOKENS, tuple(map(int, tokens)))

    @classmethod
    def edge(cls, edge: Union[str, int]) -> 'Lnk':
        """
        Create a Lnk object for an edge (used internally in generation).

        Args:
            edge: an edge identifier
        """
        return cls(Lnk.EDGE, int(edge))

    def __str__(self):
        # Serialize to the same bracketed notation the constructor
        # parses; an unspecified Lnk is the empty string.
        if self.type == Lnk.UNSPECIFIED:
            return ''
        elif self.type == Lnk.CHARSPAN:
            return '<{}:{}>'.format(self.data[0], self.data[1])
        elif self.type == Lnk.CHARTSPAN:
            return '<{}#{}>'.format(self.data[0], self.data[1])
        elif self.type == Lnk.EDGE:
            return '<@{}>'.format(self.data)
        elif self.type == Lnk.TOKENS:
            return '<{}>'.format(' '.join(map(str, self.data)))

    def __repr__(self):
        return f'<Lnk object {self!s} at {id(self)}>'

    def __eq__(self, other):
        # Defer to the other operand (and ultimately to inequality)
        # rather than raising AttributeError on non-Lnk objects.
        if not isinstance(other, Lnk):
            return NotImplemented
        return self.type == other.type and self.data == other.data

    def __bool__(self):
        # Unspecified Lnks and the (-1, -1) character span are false.
        if self.type == Lnk.UNSPECIFIED:
            return False
        if self.type == Lnk.CHARSPAN and self.data == (-1, -1):
            return False
        return True




[docs]
class LnkMixin:
    """
    Mixin that stores a Lnk and surface string on a structure and
    exposes the `cfrom` and `cto` character offsets as properties.

    Args:
        lnk: the surface alignment; a default Lnk is used when omitted
        surface: the surface string, if any
    """

    __slots__ = ('lnk', 'surface')

    def __init__(self,
                 lnk: Optional[Lnk] = None,
                 surface: Optional[str] = None):
        self.lnk = Lnk.default() if lnk is None else lnk
        self.surface = surface

    @property
    def cfrom(self) -> int:
        """
        The initial character position in the surface string.

        Defaults to -1 if there is no valid cfrom value.
        """
        try:
            link = self.lnk
            if link.type == Lnk.CHARSPAN:
                return link.data[0]  # type: ignore
        except AttributeError:
            # missing or malformed lnk: fall through to the default
            pass
        return -1

    @property
    def cto(self) -> int:
        """
        The final character position in the surface string.

        Defaults to -1 if there is no valid cto value.
        """
        try:
            link = self.lnk
            if link.type == Lnk.CHARSPAN:
                return link.data[1]  # type: ignore
        except AttributeError:
            # missing or malformed lnk: fall through to the default
            pass
        return -1