Source code for delphin.semi


"""
Semantic Interface (SEM-I)
"""

import re
import warnings
from collections.abc import Mapping, Sequence
from itertools import zip_longest
from operator import itemgetter
from pathlib import Path

from delphin import hierarchy

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.exceptions import (
    PyDelphinException,
    PyDelphinSyntaxError,
    PyDelphinWarning,
)
from delphin.predicate import normalize as normalize_predicate

# Root type handed to every hierarchy built by _new_hierarchy(); it is also
# the fallback parent for entries that declare no parents.
TOP_TYPE = '*top*'
# Role value that SemI accepts without it being a declared variable type.
STRING_TYPE = 'string'


# Section names that may appear as a `name:` header line in a SEM-I file;
# _read_file() rejects any other header.
_SEMI_SECTIONS = (
    'variables',
    'properties',
    'roles',
    'predicates',
)

# One entry of the "variables" section, e.g. `e < i : PERF bool, TENSE tense.`
# Groups: `var`, optional `parents` (joined by " & "), and optional
# `properties` (comma-separated "NAME value" pairs). A trailing `;` comment
# is allowed after the final period.
_variable_entry_re = re.compile(
    r'(?P<var>[^ .]+)'
    r'(?: < (?P<parents>[^ &:.]+(?: & [^ &:.]+)*))?'
    r'(?: : (?P<properties>[^ ]+ [^ ,.]+(?:, [^ ]+ [^ ,.]+)*))?'
    r'\s*\.\s*(?:;.*)?$',
    re.U
)

# One entry of the "properties" section, e.g. `+ < bool.`
# Groups: `type` and optional `parents` (joined by " & ").
_property_entry_re = re.compile(
    r'(?P<type>[^ .]+)'
    r'(?: < (?P<parents>[^ &.]+(?: & [^ &.]+)*))?'
    r'\s*\.\s*(?:;.*)?$',
    re.U
)

# One entry of the "roles" section, of the form `ROLE : value.`
# Groups: `role` and `value`.
_role_entry_re = re.compile(
    r'(?P<role>[^ ]+) : (?P<value>[^ .]+)\s*\.\s*(?:;.*)?$',
    re.U
)

# One entry of the "predicates" section,
# e.g. `_predicate_n_1 : ARG0 x { IND + }.`
# Groups: `pred`, optional `parents`, and the optional synopsis text.
# NOTE: the synopsis group is spelled `synposis`; _read_file() reads it
# under that exact name, so the two must be changed together if at all.
_predicate_entry_re = re.compile(
    r'(?P<pred>[^ ]+)'
    r'(?: < (?P<parents>[^ &:.;]+(?: & [^ &:.;]+)*))?'
    r'(?: : (?P<synposis>.*[^ .;]))?'
    r'\s*\.\s*(?:;.*)?$',
    re.U
)

# One role inside a predicate synopsis; applied repeatedly with finditer().
# Groups: `optional` (set when the role is wrapped in `[ ... ]`), `name`,
# `value`, and optional `properties` given inside `{ ... }`. The conditional
# `(?(optional)...)` requires the closing bracket only when an opening
# bracket was matched.
_synopsis_re = re.compile(
    r'\s*(?P<optional>\[\s*)?'
    r'(?P<name>[^ ]+) (?P<value>[^ ,.{\]]+)'
    r'(?:\s*\{\s*(?P<properties>[^ ]+ [^ ,}]+(?:, [^ ]+ [^ ,}]+)*)\s*\})?'
    r'(?(optional)\s*\])'
    r'(?:\s*(?:,\s*|$))',
    re.U
)



[docs]
class SemIError(PyDelphinException):
    """
    Raised for invalid SEM-I content or failed SEM-I lookups.

    Within this module it is raised while building a :class:`SemI`
    (undefined roles, variable types, or property values) and by
    :meth:`SemI.find_synopsis` when no predicate or synopsis matches.
    """




[docs]
class SemISyntaxError(PyDelphinSyntaxError):
    """
    Raised when a SEM-I file contains a malformed line.

    The file reader raises it for an unknown section header or for an
    invalid entry in the variables, properties, or roles sections.
    """




[docs]
class SemIWarning(PyDelphinWarning):
    """
    Warning class for questionable SEM-Is.

    Within this module it is issued when an entry is redefined while
    reading a file and when a synopsis uses a property that is not
    listed for the role's variable type.
    """




[docs]
def load(source, encoding='utf-8'):
    """
    Read the SEM-I whose top-level file is *source* and build a SemI.

    A leading ``~`` in *source* is expanded, and the directory holding
    the top file is handed to the reader as the base directory.

    Args:
        source: path of the top file of the SEM-I; this must be a
            path, not an already opened file.
        encoding (str): character encoding used to read the file(s)
    Returns:
        A :class:`SemI` constructed from the data read from *source*
    """
    top_file = Path(source).expanduser()
    base_directory = top_file.parent
    sections = _read_file(top_file, base_directory, encoding)
    return SemI(**sections)



def _read_file(path, basedir, encoding):
    data = {
        'variables': {},
        'properties': {},
        'roles': {},
        'predicates': {},
    }
    section = None

    for lineno, line in enumerate(path.open(encoding=encoding), 1):
        line = line.lstrip()

        if not line or line.startswith(';'):
            continue

        match = re.match(r'(?P<name>[^: ]+):\s*$', line)
        if match is not None:
            name = match.group('name')
            if name not in _SEMI_SECTIONS:
                raise SemISyntaxError(
                    'invalid SEM-I section',
                    filename=str(path), lineno=lineno, text=line)
            else:
                section = name
            continue

        match = re.match(r'include:\s*(?P<filename>.+)$', line, flags=re.U)
        if match is not None:
            include = basedir.joinpath(match.group('filename').rstrip())
            include_data = _read_file(
                include, include.parent, encoding)
            for key, val in include_data['variables'].items():
                _incorporate(data['variables'], key, val, include)
            for key, val in include_data['properties'].items():
                _incorporate(data['properties'], key, val, include)
            for key, val in include_data['roles'].items():
                _incorporate(data['roles'], key, val, include)
            for pred, d in include_data['predicates'].items():
                if pred not in data['predicates']:
                    data['predicates'][pred] = {
                        'parents': [],
                        'synopses': []
                    }
                if d.get('parents'):
                    data['predicates'][pred]['parents'] = d['parents']
                if d.get('synopses'):
                    data['predicates'][pred]['synopses'].extend(d['synopses'])

        elif section == 'variables':
            # e.g. e < i : PERF bool, TENSE tense.
            match = _variable_entry_re.match(line)
            if match is not None:
                identifier = match.group('var')
                supertypes = match.group('parents') or []
                if supertypes:
                    supertypes = supertypes.split(' & ')
                properties = match.group('properties') or []
                if properties:
                    pairs = properties.split(', ')
                    properties = [pair.split() for pair in pairs]
                v = {'parents': supertypes, 'properties': properties}
                # v = type(identifier, supertypes, d)
                _incorporate(data['variables'], identifier, v, path)
            else:
                raise SemISyntaxError(
                    'invalid variable',
                    filename=str(path), lineno=lineno, text=line)

        elif section == 'properties':
            # e.g. + < bool.
            match = _property_entry_re.match(line)
            if match is not None:
                _type = match.group('type')
                supertypes = match.group('parents') or []
                if supertypes:
                    supertypes = supertypes.split(' & ')
                _incorporate(
                    data['properties'], _type, {'parents': supertypes}, path)
            else:
                raise SemISyntaxError(
                    'invalid property',
                    filename=str(path), lineno=lineno, text=line)

        elif section == 'roles':
            # e.g. + < bool.
            match = _role_entry_re.match(line)
            if match is not None:
                role, value = match.group('role'), match.group('value')
                _incorporate(data['roles'], role, {'value': value}, path)
            else:
                raise SemISyntaxError(
                    'invalid role',
                    filename=str(path), lineno=lineno, text=line)

        elif section == 'predicates':
            # e.g. _predicate_n_1 : ARG0 x { IND + }.
            match = _predicate_entry_re.match(line)
            if match is not None:
                pred = match.group('pred')
                if pred not in data['predicates']:
                    data['predicates'][pred] = {
                        'parents': [],
                        'synopses': []
                    }
                sups = match.group('parents')
                if sups:
                    data['predicates'][pred]['parents'] = sups.split(' & ')
                synposis = match.group('synposis')
                roles = []
                if synposis:
                    for rolematch in _synopsis_re.finditer(synposis):
                        d = rolematch.groupdict()
                        propstr = d['properties'] or ''
                        d['properties'] = dict(
                            pair.split() for pair in propstr.split(', ')
                            if pair.strip() != '')
                        d['optional'] = bool(d['optional'])
                        roles.append(d)
                    data['predicates'][pred]['synopses'].append(
                        {'roles': roles})

    return data


def _incorporate(d, key, val, path):
    if key in d:
        warnings.warn(
            f"'{key}' redefined in {path}",
            SemIWarning,
            stacklevel=2,
        )
    d[key] = val



[docs]
class SynopsisRole(tuple):
    """
    Role data associated with a SEM-I predicate synopsis.

    The role is stored as a 4-tuple ``(name, value, properties,
    optional)``. On construction the name and property names are
    uppercased, the value and property values are lowercased, and
    *optional* is coerced to a bool.

    Args:
        name (str): the role name
        value (str): the role value (variable type or `"string"`)
        properties (dict): properties associated with the role's value;
            anything accepted by ``dict()`` works, and a false value
            means no properties
        optional (bool): a flag indicating if the role is optional
    Example:

    >>> role = SynopsisRole('ARG0', 'x', {'PERS': '3'}, False)
    """

    name = property(itemgetter(0), doc='The role name.')
    value = property(
        itemgetter(1), doc='The role value (variable type or "string").')
    properties = property(itemgetter(2), doc='Property-value map.')
    optional = property(itemgetter(3), doc="`True` if the role is optional.")

    def __new__(cls, name, value, properties=None, optional=False):
        if not properties:
            properties = {}
        else:
            properties = {prop.upper(): val.lower()
                          for prop, val in dict(properties).items()}
        return super().__new__(cls, ([name.upper(),
                                      value.lower(),
                                      properties,
                                      bool(optional)]))

    def __repr__(self):
        # Fields are formatted with str(), so string fields appear unquoted.
        return 'SynopsisRole({}, {}, {}, {})'.format(
            self.name, self.value, self.properties, self.optional)

    def _to_dict(self):
        """Return a dict form; empty properties and False optional are
        left out."""
        d = {"name": self.name, "value": self.value}
        if self.properties:
            d['properties'] = dict(self.properties)
        if self.optional:
            d['optional'] = True
        return d

    @classmethod
    def _from_dict(cls, d):
        """Build a role from a dict with 'name' and 'value' keys and
        optional 'properties' and 'optional' keys."""
        return cls(d['name'],
                   d['value'],
                   d.get('properties', []),
                   d.get('optional', False))




[docs]
class Synopsis(tuple):
    """
    A SEM-I predicate synopsis.

    A synopsis is simply a tuple whose items are the roles of a
    predicate, each a :class:`SynopsisRole`. Its length is therefore
    the predicate's arity, and each item carries the role name, the
    argument type, any associated properties, and whether the role is
    optional.
    """

    def __repr__(self):
        role_reprs = [repr(role) for role in self]
        return 'Synopsis([' + ', '.join(role_reprs) + '])'


[docs]
    @classmethod
    def from_dict(cls, d):
        """
        Create a Synopsis from its dictionary representation.

        Example:

        >>> synopsis = Synopsis.from_dict({
        ...     'roles': [
        ...         {'name': 'ARG0', 'value': 'e'},
        ...         {'name': 'ARG1', 'value': 'x',
        ...          'properties': {'NUM': 'sg'}}
        ...     ]
        ... })
        ...
        >>> len(synopsis)
        2
        """
        return cls(SynopsisRole._from_dict(role)
                   for role in d.get('roles', []))



[docs]
    def to_dict(self):
        """
        Return a dictionary representation of the Synopsis.

        Example:

        >>> Synopsis([
        ...     SynopsisRole('ARG0', 'e'),
        ...     SynopsisRole('ARG1', 'x', {'NUM': 'sg'})
        ... ]).to_dict()
        {'roles': [{'name': 'ARG0', 'value': 'e'},
                   {'name': 'ARG1', 'value': 'x',
                    'properties': {'NUM': 'sg'}}]}
        """

        return {'roles': [role._to_dict() for role in self]}



[docs]
    def subsumes(self, args, variables=None):
        """
        Return `True` if the Synopsis subsumes *args*.

        The *args* argument is a description of MRS arguments. It may
        take two different forms:

        - a sequence (e.g., string or list) of variable types, e.g.,
          `"exh"`, which must be subsumed by the role values of the
          synopsis in order

        - a mapping (e.g., a dict) of roles to variable types which
          must match roles in the synopsis; the variable type may be
          `None` which matches any role value

        In both cases, the sequence or mapping must be a subset of the
        roles of the synopsis, and any missing must be optional roles,
        otherwise the synopsis does not subsume *args*.

        The *variables* argument is a variable hierarchy. If it is
        `None`, variables will be checked for strict equality.
        """
        if len(args) > len(self):
            return False  # some arg won't be in the synopsis
        # normalize input
        if isinstance(args, Sequence):
            vartypes = (v.lower() if v else None for v in args)
            roleargs = list(zip_longest([], vartypes, self))
        elif isinstance(args, Mapping):
            name_to_roles = {d.name: d for d in self}
            roleargs = []
            for role in set(args).union(name_to_roles):
                role = role.upper()
                v = args.get(role, '')
                v = v.lower() if v else None
                roleargs.append((role, v, name_to_roles.get(role)))
        else:
            raise TypeError(args.__class__.__name__)

        # per-role checks
        for role, arg, synrole in roleargs:
            if synrole is None:
                return False  # unmatched role in args
            elif role is None and arg is None and not synrole.optional:
                return False  # unmatched synopsis role
            # elif role is not None and role != synrole.name:
            #     return False  # invalid role in sequence
            elif arg is not None:
                if variables is not None:
                    if not variables.subsumes(synrole.value, arg):
                        return False
                elif synrole.value != arg:
                    return False
        # all tests passed
        return True





[docs]
class SemI:
    """
    A semantic interface.

    A SEM-I describes the semantic inventory of a grammar: its
    variable types, the properties that are valid on variables, the
    roles that are valid on predications, and a lexicon of predicates
    together with their roles.

    Args:
        variables: a mapping of (var, {'parents': [...], 'properties': [...]})
        properties: a mapping of (prop, {'parents': [...]})
        roles: a mapping of (role, {'value': ...})
        predicates: a mapping of (pred, {'parents': [...], 'synopses': [...]})
    Attributes:
        variables: a :class:`~delphin.hierarchy.MultiHierarchy` of variables;
            node data contains the property lists
        properties: a :class:`~delphin.hierarchy.MultiHierarchy` of properties
        roles: mapping of role names to allowed variable types
        predicates: a :class:`~delphin.hierarchy.MultiHierarchy` of predicates;
            node data contains lists of synopses
    """

    def __init__(self,
                 variables=None,
                 properties=None,
                 roles=None,
                 predicates=None):
        # start from empty containers
        self.properties = _new_hierarchy()
        self.variables = _new_hierarchy()
        self.roles = {}
        self.predicates = _new_hierarchy()
        # Validate and normalize the inputs. The order matters: each
        # initializer checks its data against the sections before it.
        initializers = (
            (properties, self._init_properties),
            (variables, self._init_variables),
            (roles, self._init_roles),
            (predicates, self._init_predicates),
        )
        for section_data, initialize in initializers:
            if section_data:
                initialize(section_data)

    def _init_properties(self, properties):
        subhier = {prop: data.get('parents') or TOP_TYPE
                   for prop, data in properties.items()}
        self.properties.update(subhierarchy=subhier)

    def _init_variables(self, variables):
        subhier, data = {}, {}
        for var, var_data in variables.items():
            properties = []
            for k, v in var_data.get('properties', []):
                k, v = k.upper(), v.lower()
                if v not in self.properties:
                    raise SemIError(f'undefined property value: {v}')
                properties.append((k, v))
            subhier[var] = var_data.get('parents') or TOP_TYPE
            data[var] = properties
        self.variables.update(subhierarchy=subhier, data=data)

    def _init_roles(self, roles):
        for role, data in roles.items():
            role = role.upper()
            var = data['value'].lower()
            if not (var == STRING_TYPE or var in self.variables):
                raise SemIError(f'undefined variable type: {var}')
            self.roles[role] = var

    def _init_predicates(self, predicates):
        subhier, data = {}, {}
        propcache = {v: dict(props or [])
                     for v, props in self.variables.items()}
        for pred, pred_data in predicates.items():
            synopses = []
            for synopsis_data in pred_data.get('synopses', []):
                synopses.append(
                    self._init_synopsis(pred, synopsis_data, propcache))
            subhier[pred] = pred_data.get('parents') or TOP_TYPE
            data[pred] = synopses
        self.predicates.update(subhierarchy=subhier, data=data)

    def _init_synopsis(self, pred, synopsis_data, propcache):
        """
        Build and validate one synopsis of *pred*.

        Args:
            pred: predicate symbol, used only in messages
            synopsis_data: dictionary form of the synopsis
            propcache: mapping of variable type to its
                ``{property: value}`` dict
        Returns:
            The :class:`Synopsis` built from *synopsis_data*
        Raises:
            SemIError: for an undefined role, variable type, or property
                value, for properties on a string-valued role, or for
                a property value incompatible with the variable's own
        """
        synopsis = Synopsis.from_dict(synopsis_data)
        for role in synopsis:
            if role.name not in self.roles:
                raise SemIError(f'{pred}: undefined role: {role.name}')

            vartype = role.value
            if vartype == STRING_TYPE:
                if role.properties:
                    raise SemIError(
                        f'{pred}: strings cannot define properties')
                continue
            if vartype not in self.variables:
                raise SemIError(
                    f'{pred}: undefined variable type: {vartype}')

            for prop, val in role.properties.items():
                if val not in self.properties:
                    raise SemIError(
                        f'{pred}: undefined property value: {val}')
                if prop in propcache[vartype]:
                    expected = propcache[vartype][prop]
                    if not self.properties.compatible(val, expected):
                        raise SemIError(
                            '{}: incompatible property values: {}, {}'
                            .format(pred, val, expected))
                else:
                    # Just warn because of the current situation where
                    # 'i' variables are used for unexpressed 'x's
                    warnings.warn(
                        "{}: property '{}' not allowed on '{}'"
                        .format(pred, prop, vartype),
                        SemIWarning,
                        stacklevel=2,
                    )
        return synopsis


[docs]
    @classmethod
    def from_dict(cls, d):
        """Instantiate a SemI from a dictionary representation."""
        return cls(**d)



[docs]
    def to_dict(self):
        """Return a dictionary representation of the SemI."""

        def add_parents(d, ps):
            if ps and list(ps) != [TOP_TYPE]:
                d['parents'] = list(ps)

        variables = {}
        for var, data in self.variables.items():
            variables[var] = d = {}
            add_parents(d, self.variables.parents(var))
            if data:
                d['properties'] = list(map(list, data))

        properties = {}
        for prop in self.properties:
            properties[prop] = d = {}
            add_parents(d, self.properties.parents(prop))

        roles = {role: {'value': value} for role, value in self.roles.items()}

        predicates = {}
        for pred, data in self.predicates.items():
            predicates[pred] = d = {}
            add_parents(d, self.predicates.parents(pred))
            if data:
                d['synopses'] = [synopsis.to_dict() for synopsis in data]

        return {'variables': variables,
                'properties': properties,
                'roles': roles,
                'predicates': predicates}



[docs]
    def find_synopsis(self, predicate, args=None):
        """
        Return the first matching synopsis for *predicate*.

        *predicate* is normalized before it is looked up.

        When *args* is given, each synopsis of the predicate is tested
        in order with :meth:`Synopsis.subsumes` (using this SEM-I's
        variable hierarchy) and the first one that passes is returned.
        When *args* is empty or `None`, the first synopsis is returned
        without any test.

        Args:
            predicate: predicate symbol whose synopsis will be returned
            args: description of arguments that must be subsumable by
                the synopsis
        Returns:
            the matching synopsis, as stored for the predicate
        Raises:
            :class:`SemIError`: if *predicate* is undefined or if no
                matching synopsis can be found
        Example:
            >>> synopsis = smi.find_synopsis('_write_v_to')
            >>> synopsis = smi.find_synopsis('_write_v_to', args='eii')
        """
        predicate = normalize_predicate(predicate)
        if predicate not in self.predicates:
            raise SemIError(f'undefined predicate: {predicate}')

        for candidate in self.predicates[predicate]:
            if not args or candidate.subsumes(args, self.variables):
                return candidate

        # nothing matched (or the predicate has no synopses at all)
        described_args = repr(args) if args else ''
        raise SemIError('no valid synopsis for {}({})'
                        .format(predicate, described_args))




def _new_hierarchy():
    """Create an empty MultiHierarchy rooted at TOP_TYPE that uses
    ``str.lower`` as its identifier normalizer."""
    fresh = hierarchy.MultiHierarchy(
        TOP_TYPE,
        normalize_identifier=str.lower,
    )
    return fresh