Source code for delphin.semi


"""
Semantic Interface (SEM-I)
"""

import re
import warnings
from collections.abc import Mapping, Sequence
from itertools import zip_longest
from operator import itemgetter
from pathlib import Path

from delphin import hierarchy

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.exceptions import (
    PyDelphinException,
    PyDelphinSyntaxError,
    PyDelphinWarning,
)
from delphin.predicate import normalize as normalize_predicate

# Identifier used as the root of every hierarchy built by this module and
# as the default parent of any entry declared without explicit parents.
TOP_TYPE = '*top*'
# Role value that denotes a string argument rather than a variable type.
STRING_TYPE = 'string'


# The only section names accepted as `name:` headers in a SEM-I file.
_SEMI_SECTIONS = (
    'variables',
    'properties',
    'roles',
    'predicates',
)

# One entry of the `variables` section, e.g.:
#     e < i : PERF bool, TENSE tense.
# Groups: `var`, optional `parents` (joined by ' & '), and optional
# `properties` ('NAME value' pairs joined by ', ').  The entry must end
# with a period, optionally followed by a `;` comment.
_variable_entry_re = re.compile(
    r'(?P<var>[^ .]+)'
    r'(?: < (?P<parents>[^ &:.]+(?: & [^ &:.]+)*))?'
    r'(?: : (?P<properties>[^ ]+ [^ ,.]+(?:, [^ ]+ [^ ,.]+)*))?'
    r'\s*\.\s*(?:;.*)?$',
    re.U
)

# One entry of the `properties` section, e.g.:
#     + < bool.
# Groups: `type` and optional `parents` (joined by ' & ').
_property_entry_re = re.compile(
    r'(?P<type>[^ .]+)'
    r'(?: < (?P<parents>[^ &.]+(?: & [^ &.]+)*))?'
    r'\s*\.\s*(?:;.*)?$',
    re.U
)

# One entry of the `roles` section, e.g.:
#     ARG0 : i.
# Groups: `role` and `value`.
_role_entry_re = re.compile(
    r'(?P<role>[^ ]+) : (?P<value>[^ .]+)\s*\.\s*(?:;.*)?$',
    re.U
)

# One entry of the `predicates` section, e.g.:
#     _predicate_n_1 : ARG0 x { IND + }.
# Groups: `pred`, optional `parents` (joined by ' & '), and the optional
# synopsis text.  NOTE: the synopsis group is named 'synposis' (sic); the
# file parser reads it under that exact spelling, so the two must be
# renamed together if the typo is ever corrected.
_predicate_entry_re = re.compile(
    r'(?P<pred>[^ ]+)'
    r'(?: < (?P<parents>[^ &:.;]+(?: & [^ &:.;]+)*))?'
    r'(?: : (?P<synposis>.*[^ .;]))?'
    r'\s*\.\s*(?:;.*)?$',
    re.U
)

# A single role inside a predicate synopsis, e.g. `ARG0 x { IND + }` or
# `[ ARG2 h ]`.  Groups: `optional` (set when the role opens with '['),
# `name`, `value`, and optional `properties` ('NAME value' pairs joined by
# ', ' inside braces).  The conditional `(?(optional)\s*\])` requires the
# closing ']' only when the opening '[' was matched.  Each role must be
# followed by a comma or the end of the synopsis text.
_synopsis_re = re.compile(
    r'\s*(?P<optional>\[\s*)?'
    r'(?P<name>[^ ]+) (?P<value>[^ ,.{\]]+)'
    r'(?:\s*\{\s*(?P<properties>[^ ]+ [^ ,}]+(?:, [^ ]+ [^ ,}]+)*)\s*\})?'
    r'(?(optional)\s*\])'
    r'(?:\s*(?:,\s*|$))',
    re.U
)


class SemIError(PyDelphinException):
    """
    Error for an invalid SEM-I.

    Raised when SEM-I data fails validation (for example, an undefined
    property value, variable type, or role) and when a predicate or
    synopsis lookup fails.
    """
class SemISyntaxError(PyDelphinSyntaxError):
    """
    Error for a malformed SEM-I file.

    Raised while reading a SEM-I file when a section header or an entry
    line cannot be parsed.
    """
class SemIWarning(PyDelphinWarning):
    """
    Warning category for questionable, but not fatal, SEM-I content.

    Issued, for example, when an entry is redefined or when a synopsis
    uses a property that its variable type does not declare.
    """
def load(source, encoding='utf-8'):
    """
    Read the SEM-I whose top-level file is *source* and return it.

    Args:
        source: path of the top file of the SEM-I; it must be a path,
            not an already opened file
        encoding (str): character encoding used to read the file
    Returns:
        The :class:`SemI` built from the contents of *source*
    """
    top_file = Path(source).expanduser()
    # Includes are resolved relative to the directory of the top file.
    sections = _read_file(top_file, top_file.parent, encoding)
    return SemI(**sections)
def _read_file(path, basedir, encoding):
    """
    Parse the SEM-I file at *path* into plain dictionaries.

    Blank lines and lines starting with ``;`` are skipped. A line of the
    form ``name:`` switches the current section, ``include: FILE`` reads
    another file (resolved against *basedir*) and merges its contents,
    and every other line is parsed as an entry of the current section.

    Args:
        path: path object of the file to read
        basedir: directory against which ``include:`` filenames are
            resolved
        encoding (str): character encoding used to read the file
    Returns:
        A dict with the keys ``'variables'``, ``'properties'``,
        ``'roles'``, and ``'predicates'``, each mapping an identifier to
        its parsed data.
    Raises:
        SemISyntaxError: on an unknown section name or on an invalid
            variable, property, or role entry
    """
    data = {
        'variables': {},
        'properties': {},
        'roles': {},
        'predicates': {},
    }
    section = None

    # Open the file in a context manager so the handle is always closed,
    # including when a syntax error is raised part-way through the file.
    with path.open(encoding=encoding) as stream:
        for lineno, line in enumerate(stream, 1):
            line = line.lstrip()

            if not line or line.startswith(';'):
                continue

            match = re.match(r'(?P<name>[^: ]+):\s*$', line)
            if match is not None:
                name = match.group('name')
                if name not in _SEMI_SECTIONS:
                    raise SemISyntaxError(
                        'invalid SEM-I section',
                        filename=str(path), lineno=lineno, text=line)
                section = name
                continue

            match = re.match(
                r'include:\s*(?P<filename>.+)$', line, flags=re.U)
            if match is not None:
                include = basedir.joinpath(match.group('filename').rstrip())
                include_data = _read_file(
                    include, include.parent, encoding)
                for key, val in include_data['variables'].items():
                    _incorporate(data['variables'], key, val, include)
                for key, val in include_data['properties'].items():
                    _incorporate(data['properties'], key, val, include)
                for key, val in include_data['roles'].items():
                    _incorporate(data['roles'], key, val, include)
                # Predicates are merged rather than replaced: parents are
                # overridden when the include gives any, and synopses
                # accumulate.
                for pred, d in include_data['predicates'].items():
                    if pred not in data['predicates']:
                        data['predicates'][pred] = {
                            'parents': [],
                            'synopses': [],
                        }
                    if d.get('parents'):
                        data['predicates'][pred]['parents'] = d['parents']
                    if d.get('synopses'):
                        data['predicates'][pred]['synopses'].extend(
                            d['synopses'])

            elif section == 'variables':
                # e.g. e < i : PERF bool, TENSE tense.
                match = _variable_entry_re.match(line)
                if match is None:
                    raise SemISyntaxError(
                        'invalid variable',
                        filename=str(path), lineno=lineno, text=line)
                identifier = match.group('var')
                supertypes = match.group('parents') or []
                if supertypes:
                    supertypes = supertypes.split(' & ')
                properties = match.group('properties') or []
                if properties:
                    pairs = properties.split(', ')
                    properties = [pair.split() for pair in pairs]
                v = {'parents': supertypes, 'properties': properties}
                _incorporate(data['variables'], identifier, v, path)

            elif section == 'properties':
                # e.g. + < bool.
                match = _property_entry_re.match(line)
                if match is None:
                    raise SemISyntaxError(
                        'invalid property',
                        filename=str(path), lineno=lineno, text=line)
                _type = match.group('type')
                supertypes = match.group('parents') or []
                if supertypes:
                    supertypes = supertypes.split(' & ')
                _incorporate(
                    data['properties'], _type, {'parents': supertypes}, path)

            elif section == 'roles':
                # e.g. ARG0 : i.
                match = _role_entry_re.match(line)
                if match is None:
                    raise SemISyntaxError(
                        'invalid role',
                        filename=str(path), lineno=lineno, text=line)
                role, value = match.group('role'), match.group('value')
                _incorporate(data['roles'], role, {'value': value}, path)

            elif section == 'predicates':
                # e.g. _predicate_n_1 : ARG0 x { IND + }.
                # NOTE(review): unlike the other sections, a predicate
                # line that does not match is skipped without an error;
                # that behavior is preserved here.
                match = _predicate_entry_re.match(line)
                if match is not None:
                    pred = match.group('pred')
                    if pred not in data['predicates']:
                        data['predicates'][pred] = {
                            'parents': [],
                            'synopses': [],
                        }
                    sups = match.group('parents')
                    if sups:
                        data['predicates'][pred]['parents'] = (
                            sups.split(' & '))
                    # The regex group is spelled 'synposis' (sic).
                    synposis = match.group('synposis')
                    roles = []
                    if synposis:
                        for rolematch in _synopsis_re.finditer(synposis):
                            d = rolematch.groupdict()
                            propstr = d['properties'] or ''
                            d['properties'] = dict(
                                pair.split()
                                for pair in propstr.split(', ')
                                if pair.strip() != '')
                            d['optional'] = bool(d['optional'])
                            roles.append(d)
                    data['predicates'][pred]['synopses'].append(
                        {'roles': roles})

    return data


def _incorporate(d, key, val, path):
    """
    Store *val* under *key* in *d*, warning if *key* is already present.

    The new value always replaces the old one; *path* is only used to
    name the file in the :class:`SemIWarning` message.
    """
    if key in d:
        warnings.warn(
            f"'{key}' redefined in {path}",
            SemIWarning,
            stacklevel=2,
        )
    d[key] = val
class SynopsisRole(tuple):
    """
    Role data associated with a SEM-I predicate synopsis.

    The role is stored as a 4-tuple ``(name, value, properties,
    optional)``. On construction the name and property names are
    upper-cased, the value and property values are lower-cased, and
    *optional* is coerced to a bool.

    Args:
        name (str): the role name
        value (str): the role value (variable type or `"string"`)
        properties (dict): properties associated with the role's value
        optional (bool): a flag indicating if the role is optional
    Example:

        >>> role = SynopsisRole('ARG0', 'x', {'PERS': '3'}, False)
        >>> role
        SynopsisRole('ARG0', 'x', {'PERS': '3'}, False)
    """

    name = property(itemgetter(0), doc='The role name.')
    value = property(
        itemgetter(1), doc='The role value (variable type or "string").')
    properties = property(itemgetter(2), doc='Property-value map.')
    optional = property(itemgetter(3), doc="`True` if the role is optional.")

    def __new__(cls, name, value, properties=None, optional=False):
        if not properties:
            properties = {}
        else:
            # dict() accepts both mappings and iterables of pairs
            properties = {prop.upper(): val.lower()
                          for prop, val in dict(properties).items()}
        return super().__new__(
            cls,
            (name.upper(), value.lower(), properties, bool(optional)))

    def __repr__(self):
        # Use !r so string fields are quoted and the output mirrors the
        # constructor call instead of printing bare, ambiguous words.
        return 'SynopsisRole({!r}, {!r}, {!r}, {!r})'.format(
            self.name, self.value, self.properties, self.optional)

    def _to_dict(self):
        """Return a dict of the role; empty/false fields are omitted."""
        d = {"name": self.name, "value": self.value}
        if self.properties:
            d['properties'] = dict(self.properties)
        if self.optional:
            d['optional'] = True
        return d

    @classmethod
    def _from_dict(cls, d):
        """Build a role from a dict as produced by :meth:`_to_dict`."""
        return cls(d['name'],
                   d['value'],
                   d.get('properties'),
                   d.get('optional', False))
class Synopsis(tuple):
    """
    A SEM-I predicate synopsis.

    A synopsis describes the roles of a predicate in a semantic
    structure, so it is no more than a tuple of roles as
    :class:`SynopsisRole` objects. The length of the synopsis is thus
    the arity of a predicate while the individual role items detail
    the role names, argument types, associated properties, and
    optionality.
    """

    def __repr__(self):
        return 'Synopsis([{}])'.format(', '.join(map(repr, self)))

    @classmethod
    def from_dict(cls, d):
        """
        Create a Synopsis from its dictionary representation.

        Example:

            >>> synopsis = Synopsis.from_dict({
            ...     'roles': [
            ...         {'name': 'ARG0', 'value': 'e'},
            ...         {'name': 'ARG1', 'value': 'x',
            ...          'properties': {'NUM': 'sg'}}
            ...     ]
            ... })
            ...
            >>> len(synopsis)
            2
        """
        return cls(SynopsisRole._from_dict(role)
                   for role in d.get('roles', []))

    def to_dict(self):
        """
        Return a dictionary representation of the Synopsis.

        Example:

            >>> Synopsis([
            ...     SynopsisRole('ARG0', 'e'),
            ...     SynopsisRole('ARG1', 'x', {'NUM': 'sg'})
            ... ]).to_dict()
            {'roles': [{'name': 'ARG0', 'value': 'e'},
                       {'name': 'ARG1', 'value': 'x',
                        'properties': {'NUM': 'sg'}}]}
        """
        return {'roles': [role._to_dict() for role in self]}

    def subsumes(self, args, variables=None):
        """
        Return `True` if the Synopsis subsumes *args*.

        The *args* argument is a description of MRS arguments. It may
        take two different forms:

        - a sequence (e.g., string or list) of variable types, e.g.,
          `"exh"`, which must be subsumed by the role values of the
          synopsis in order; an empty or `None` item is treated like a
          missing argument
        - a mapping (e.g., a dict) of roles to variable types which
          must match roles in the synopsis; role names are compared
          case-insensitively and the variable type may be `None`,
          which matches any role value

        In both cases, the sequence or mapping must be a subset of the
        roles of the synopsis, and any missing must be optional roles,
        otherwise the synopsis does not subsume *args*.

        The *variables* argument is a variable hierarchy. If it is
        `None`, variables will be checked for strict equality.

        Raises:
            TypeError: if *args* is neither a sequence nor a mapping
        """
        if len(args) > len(self):
            return False  # some arg won't be in the synopsis

        # Normalize both input forms to (synrole, vartype, given) triples,
        # one per role of the synopsis. `given` says whether args supplied
        # the role at all, which is distinct from supplying it with a
        # wildcard (None) type.
        checks = []
        if isinstance(args, Sequence):
            # args is no longer than the synopsis, so only args is padded
            for vartype, synrole in zip_longest(args, self):
                vartype = vartype.lower() if vartype else None
                checks.append((synrole, vartype, vartype is not None))
        elif isinstance(args, Mapping):
            # Normalize the role names before any lookup so that
            # lowercase keys keep their values.
            given_args = {role.upper(): vartype
                          for role, vartype in args.items()}
            synroles = {synrole.name: synrole for synrole in self}
            if any(role not in synroles for role in given_args):
                return False  # unmatched role in args
            for name, synrole in synroles.items():
                vartype = given_args.get(name)
                vartype = vartype.lower() if vartype else None
                checks.append((synrole, vartype, name in given_args))
        else:
            raise TypeError(args.__class__.__name__)

        # per-role checks
        for synrole, vartype, given in checks:
            if not given:
                if not synrole.optional:
                    return False  # required synopsis role is missing
            elif vartype is not None:
                if variables is not None:
                    if not variables.subsumes(synrole.value, vartype):
                        return False
                elif synrole.value != vartype:
                    return False

        # all tests passed
        return True
class SemI:
    """
    A semantic interface.

    A SEM-I is the semantic inventory of a grammar: its variable types,
    the properties valid on variables, the roles valid on predications,
    and a lexicon of predicates with their associated roles.

    Args:
        variables: a mapping of (var, {'parents': [...], 'properties': [...]})
        properties: a mapping of (prop, {'parents': [...]})
        roles: a mapping of (role, {'value': ...})
        predicates: a mapping of (pred, {'parents': [...], 'synopses': [...]})
    Attributes:
        variables: a :class:`~delphin.hierarchy.MultiHierarchy` of
            variables; node data contains the property lists
        properties: a :class:`~delphin.hierarchy.MultiHierarchy` of
            properties
        roles: mapping of role names to allowed variable types
        predicates: a :class:`~delphin.hierarchy.MultiHierarchy` of
            predicates; node data contains lists of synopses
    """

    def __init__(self,
                 variables=None,
                 properties=None,
                 roles=None,
                 predicates=None):
        self.properties = _new_hierarchy()
        self.variables = _new_hierarchy()
        self.roles = {}
        self.predicates = _new_hierarchy()

        # Validate and normalize the inputs. The order matters: each
        # initializer checks its entries against the sections before it.
        initializers = (
            (properties, self._init_properties),
            (variables, self._init_variables),
            (roles, self._init_roles),
            (predicates, self._init_predicates),
        )
        for section, initialize in initializers:
            if section:
                initialize(section)

    def _init_properties(self, properties):
        """Add *properties* to the property hierarchy."""
        subhier = {}
        for prop, prop_data in properties.items():
            subhier[prop] = prop_data.get('parents') or TOP_TYPE
        self.properties.update(subhierarchy=subhier)

    def _init_variables(self, variables):
        """Add *variables*, checking that their property values exist."""
        subhier = {}
        data = {}
        for var, var_data in variables.items():
            normalized = []
            for k, v in var_data.get('properties', []):
                k = k.upper()
                v = v.lower()
                if v not in self.properties:
                    raise SemIError(f'undefined property value: {v}')
                normalized.append((k, v))
            subhier[var] = var_data.get('parents') or TOP_TYPE
            data[var] = normalized
        self.variables.update(subhierarchy=subhier, data=data)

    def _init_roles(self, roles):
        """Record *roles*, checking that their values are known types."""
        for role, role_data in roles.items():
            var = role_data['value'].lower()
            if var != STRING_TYPE and var not in self.variables:
                raise SemIError(f'undefined variable type: {var}')
            self.roles[role.upper()] = var

    def _init_predicates(self, predicates):
        """Add *predicates* and their validated synopses."""
        subhier = {}
        data = {}
        # property-name -> value lookup for each variable type
        propcache = {v: dict(props or [])
                     for v, props in self.variables.items()}
        for pred, pred_data in predicates.items():
            checked = []
            for synopsis_data in pred_data.get('synopses', []):
                checked.append(
                    self._init_synopsis(pred, synopsis_data, propcache))
            subhier[pred] = pred_data.get('parents') or TOP_TYPE
            data[pred] = checked
        self.predicates.update(subhierarchy=subhier, data=data)

    def _init_synopsis(self, pred, synopsis_data, propcache):
        """Build the synopsis of *pred* and validate each of its roles."""
        synopsis = Synopsis.from_dict(synopsis_data)
        for role in synopsis:
            if role.name not in self.roles:
                raise SemIError(f'{pred}: undefined role: {role.name}')

            if role.value == STRING_TYPE:
                if role.properties:
                    raise SemIError(
                        f'{pred}: strings cannot define properties')
                continue

            if role.value not in self.variables:
                raise SemIError(
                    f'{pred}: undefined variable type: {role.value}')

            for k, v in role.properties.items():
                if v not in self.properties:
                    raise SemIError(
                        f'{pred}: undefined property value: {v}')
                # The cache is consulted only when the role actually has
                # properties, so this lookup stays inside the loop.
                if k not in propcache[role.value]:
                    # Just warn because of the current situation where
                    # 'i' variables are used for unexpressed 'x's
                    warnings.warn(
                        "{}: property '{}' not allowed on '{}'"
                        .format(pred, k, role.value),
                        SemIWarning,
                        stacklevel=2,
                    )
                    continue
                _v = propcache[role.value][k]
                if not self.properties.compatible(v, _v):
                    raise SemIError(
                        '{}: incompatible property values: {}, {}'
                        .format(pred, v, _v))
        return synopsis

    @classmethod
    def from_dict(cls, d):
        """Build a SemI from a dictionary of constructor arguments."""
        return cls(**d)

    def to_dict(self):
        """
        Return the SemI as a dictionary of plain data.

        The result has the keys ``'variables'``, ``'properties'``,
        ``'roles'``, and ``'predicates'``. Parents are left out of an
        entry when there are none or when the only parent is the top
        type; property lists and synopses are left out when empty.
        """

        def new_entry(parents):
            entry = {}
            if parents and list(parents) != [TOP_TYPE]:
                entry['parents'] = list(parents)
            return entry

        variables = {}
        for var, proplist in self.variables.items():
            entry = new_entry(self.variables.parents(var))
            if proplist:
                entry['properties'] = [list(pair) for pair in proplist]
            variables[var] = entry

        properties = {}
        for prop in self.properties:
            properties[prop] = new_entry(self.properties.parents(prop))

        roles = {}
        for role, value in self.roles.items():
            roles[role] = {'value': value}

        predicates = {}
        for pred, synopses in self.predicates.items():
            entry = new_entry(self.predicates.parents(pred))
            if synopses:
                entry['synopses'] = [syn.to_dict() for syn in synopses]
            predicates[pred] = entry

        return {
            'variables': variables,
            'properties': properties,
            'roles': roles,
            'predicates': predicates,
        }

    def find_synopsis(self, predicate, args=None):
        """
        Return the first synopsis of *predicate* that matches *args*.

        *predicate* is normalized before it is looked up. When *args*
        is given, each synopsis is tested with
        :meth:`Synopsis.subsumes` using the SEM-I's variable hierarchy;
        when it is empty or `None`, the first synopsis is returned.

        Args:
            predicate: predicate symbol whose synopsis will be returned
            args: description of arguments that must be subsumable by
                the synopsis
        Returns:
            the first matching synopsis stored for the predicate
        Raises:
            :class:`SemIError`: if *predicate* is undefined or if no
                matching synopsis can be found
        Example:

            >>> synopsis = smi.find_synopsis('_write_v_to')
            >>> synopsis = smi.find_synopsis('_write_v_to', args='eii')
        """
        predicate = normalize_predicate(predicate)
        if predicate not in self.predicates:
            raise SemIError(f'undefined predicate: {predicate}')

        for synopsis in self.predicates[predicate]:
            if not args or synopsis.subsumes(args, self.variables):
                return synopsis

        raise SemIError('no valid synopsis for {}({})'
                        .format(predicate, repr(args) if args else ''))
def _new_hierarchy():
    """
    Return a new MultiHierarchy whose top is `TOP_TYPE`.

    `str.lower` is passed as the hierarchy's identifier normalizer.
    """
    return hierarchy.MultiHierarchy(
        TOP_TYPE,
        normalize_identifier=str.lower,
    )