# Source code for delphin.semi

"""
Semantic Interface (SEM-I)
"""

import re
import warnings
from collections.abc import Mapping, Sequence
from itertools import zip_longest
from operator import itemgetter
from pathlib import Path

from delphin import hierarchy

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.exceptions import (
    PyDelphinException,
    PyDelphinSyntaxError,
    PyDelphinWarning,
)
from delphin.predicate import normalize as normalize_predicate

# Root identifier of the hierarchies created by _new_hierarchy(); it is also
# the fallback parent for any entry that does not list explicit parents.
TOP_TYPE = "*top*"
# Role value used for string-valued arguments. It is accepted as a role value
# without being a declared variable type, and roles with this value may not
# carry properties.
STRING_TYPE = "string"


# Section names accepted as "name:" headers by _read_file(); any other header
# of that form is reported as a syntax error.
_SEMI_SECTIONS = (
    "variables",
    "properties",
    "roles",
    "predicates",
)

# Entry of the "variables" section, e.g. `e < i : PERF bool, TENSE tense.`
# Groups: the variable type, optional parents (joined by " & " after " < "),
# and optional "PROP value" pairs (joined by ", " after " : "). The entry must
# end with a period, optionally followed by a ";" comment.
_variable_entry_re = re.compile(
    r"(?P<var>[^ .]+)"
    r"(?: < (?P<parents>[^ &:.]+(?: & [^ &:.]+)*))?"
    r"(?: : (?P<properties>[^ ]+ [^ ,.]+(?:, [^ ]+ [^ ,.]+)*))?"
    r"\s*\.\s*(?:;.*)?$",
    re.U,
)

# Entry of the "properties" section, e.g. `+ < bool.`
# Groups: the property type and its optional parents (joined by " & ").
_property_entry_re = re.compile(
    r"(?P<type>[^ .]+)"
    r"(?: < (?P<parents>[^ &.]+(?: & [^ &.]+)*))?"
    r"\s*\.\s*(?:;.*)?$",
    re.U,
)

# Entry of the "roles" section, of the form `ROLE : value.` (e.g. `ARG0 : i.`)
# Groups: the role name and the value it takes.
_role_entry_re = re.compile(
    r"(?P<role>[^ ]+) : (?P<value>[^ .]+)\s*\.\s*(?:;.*)?$", re.U
)

# Entry of the "predicates" section, e.g. `_predicate_n_1 : ARG0 x { IND + }.`
# Groups: the predicate, optional parents, and the optional synopsis text.
# NOTE: the synopsis group is spelled "synposis" (sic); _read_file() looks the
# group up under that exact spelling, so the two must be renamed together.
_predicate_entry_re = re.compile(
    r"(?P<pred>[^ ]+)"
    r"(?: < (?P<parents>[^ &:.;]+(?: & [^ &:.;]+)*))?"
    r"(?: : (?P<synposis>.*[^ .;]))?"
    r"\s*\.\s*(?:;.*)?$",
    re.U,
)

# One role inside a predicate synopsis; applied repeatedly with finditer().
# Groups: an optional opening "[" (marks the role as optional), the role name,
# its value, and an optional "{ PROP value, ... }" property list. The
# conditional group `(?(optional)...)` demands the closing "]" only when the
# opening "[" was matched. Each role ends at a comma or the end of the text.
_synopsis_re = re.compile(
    r"\s*(?P<optional>\[\s*)?"
    r"(?P<name>[^ ]+) (?P<value>[^ ,.{\]]+)"
    r"(?:\s*\{\s*(?P<properties>[^ ]+ [^ ,}]+(?:, [^ ]+ [^ ,}]+)*)\s*\})?"
    r"(?(optional)\s*\])"
    r"(?:\s*(?:,\s*|$))",
    re.U,
)


class SemIError(PyDelphinException):
    """
    Raised when a SEM-I is invalid.

    This covers semantic problems such as undefined property values,
    variable types, roles, or predicates, and failed synopsis lookups.
    """
class SemISyntaxError(PyDelphinSyntaxError):
    """
    Raised when a SEM-I file cannot be parsed.

    This is used for unknown section headers and for malformed variable,
    property, or role entries.
    """
class SemIWarning(PyDelphinWarning):
    """
    Warning category for questionable, but loadable, SEM-Is.

    It is issued, for instance, when an entry is redefined or when a
    synopsis uses a property that its variable type does not declare.
    """
def load(source, encoding="utf-8"):
    """
    Read the SEM-I whose top file is at *source* and build a SemI from it.

    Args:
        source: path of the top SEM-I file; it must be a path (a leading
            ``~`` is expanded), not an already-open file object
        encoding (str): character encoding used to read the file
    Returns:
        The :class:`SemI` described by *source*
    """
    top_file = Path(source).expanduser()
    sections = _read_file(top_file, top_file.parent, encoding)
    return SemI(**sections)
def _read_file(path, basedir, encoding):
    """
    Parse the SEM-I file at *path* into plain dictionaries.

    Blank lines and lines starting with ``;`` are skipped. A line of the
    form ``name:`` switches the current section, ``include: file`` merges
    another file (resolved relative to *basedir*) into the result, and any
    other line is parsed according to the current section.

    Args:
        path: :class:`pathlib.Path` of the file to read
        basedir: directory against which ``include:`` filenames are resolved
        encoding (str): character encoding used to read the file
    Returns:
        A dict with the keys ``variables``, ``properties``, ``roles``, and
        ``predicates``:

        - ``variables``: ``{var: {"parents": [...], "properties": [[prop,
          value], ...]}}``
        - ``properties``: ``{type: {"parents": [...]}}``
        - ``roles``: ``{role: {"value": value}}``
        - ``predicates``: ``{pred: {"parents": [...], "synopses":
          [{"roles": [...]}, ...]}}`` where each role is a dict with the
          keys ``name``, ``value``, ``properties``, and ``optional``
    Raises:
        SemISyntaxError: on an unknown section header or a malformed
            variable, property, or role entry
    """
    data = {
        "variables": {},
        "properties": {},
        "roles": {},
        "predicates": {},
    }
    section = None

    # The context manager guarantees the file is closed, including when a
    # syntax error is raised part-way through or an include fails.
    with path.open(encoding=encoding) as semi_file:
        for lineno, line in enumerate(semi_file, 1):
            line = line.lstrip()
            if not line or line.startswith(";"):
                continue

            # section header, e.g. "variables:"
            match = re.match(r"(?P<name>[^: ]+):\s*$", line)
            if match is not None:
                name = match.group("name")
                if name not in _SEMI_SECTIONS:
                    raise SemISyntaxError(
                        "invalid SEM-I section",
                        filename=str(path),
                        lineno=lineno,
                        text=line,
                    )
                section = name
                continue

            match = re.match(r"include:\s*(?P<filename>.+)$", line, flags=re.U)
            if match is not None:
                include = basedir.joinpath(match.group("filename").rstrip())
                include_data = _read_file(include, include.parent, encoding)
                # Simple sections: included entries replace existing ones
                # (with a warning from _incorporate).
                for key in ("variables", "properties", "roles"):
                    for ident, val in include_data[key].items():
                        _incorporate(data[key], ident, val, include)
                # Predicates: parents are replaced, synopses accumulate.
                for pred, d in include_data["predicates"].items():
                    entry = data["predicates"].setdefault(
                        pred, {"parents": [], "synopses": []}
                    )
                    if d.get("parents"):
                        entry["parents"] = d["parents"]
                    if d.get("synopses"):
                        entry["synopses"].extend(d["synopses"])

            elif section == "variables":
                # e.g. e < i : PERF bool, TENSE tense.
                match = _variable_entry_re.match(line)
                if match is None:
                    raise SemISyntaxError(
                        "invalid variable",
                        filename=str(path),
                        lineno=lineno,
                        text=line,
                    )
                identifier = match.group("var")
                supertypes = match.group("parents") or []
                if supertypes:
                    supertypes = supertypes.split(" & ")
                properties = match.group("properties") or []
                if properties:
                    pairs = properties.split(", ")
                    properties = [pair.split() for pair in pairs]
                v = {"parents": supertypes, "properties": properties}
                _incorporate(data["variables"], identifier, v, path)

            elif section == "properties":
                # e.g. + < bool.
                match = _property_entry_re.match(line)
                if match is None:
                    raise SemISyntaxError(
                        "invalid property",
                        filename=str(path),
                        lineno=lineno,
                        text=line,
                    )
                _type = match.group("type")
                supertypes = match.group("parents") or []
                if supertypes:
                    supertypes = supertypes.split(" & ")
                _incorporate(
                    data["properties"], _type, {"parents": supertypes}, path
                )

            elif section == "roles":
                # e.g. ARG0 : i.
                match = _role_entry_re.match(line)
                if match is None:
                    raise SemISyntaxError(
                        "invalid role",
                        filename=str(path),
                        lineno=lineno,
                        text=line,
                    )
                role, value = match.group("role"), match.group("value")
                _incorporate(data["roles"], role, {"value": value}, path)

            elif section == "predicates":
                # e.g. _predicate_n_1 : ARG0 x { IND + }.
                # NOTE(review): unlike the other sections, a line that does
                # not match is skipped silently rather than reported;
                # confirm whether that leniency is intentional.
                match = _predicate_entry_re.match(line)
                if match is not None:
                    pred = match.group("pred")
                    entry = data["predicates"].setdefault(
                        pred, {"parents": [], "synopses": []}
                    )
                    sups = match.group("parents")
                    if sups:
                        entry["parents"] = sups.split(" & ")
                    # the regex group name is spelled "synposis"
                    synposis = match.group("synposis")
                    roles = []
                    if synposis:
                        for rolematch in _synopsis_re.finditer(synposis):
                            d = rolematch.groupdict()
                            propstr = d["properties"] or ""
                            d["properties"] = dict(
                                pair.split()
                                for pair in propstr.split(", ")
                                if pair.strip() != ""
                            )
                            d["optional"] = bool(d["optional"])
                            roles.append(d)
                    entry["synopses"].append({"roles": roles})

    return data


def _incorporate(d, key, val, path):
    """
    Store *val* under *key* in *d*, warning if *key* was already present.

    Args:
        d (dict): the section mapping being filled
        key: identifier of the entry
        val: parsed entry data; replaces any previous value
        path: file being read, used only in the warning message
    """
    if key in d:
        warnings.warn(
            f"'{key}' redefined in {path}",
            SemIWarning,
            stacklevel=2,
        )
    d[key] = val
class SynopsisRole(tuple):
    """
    Role data associated with a SEM-I predicate synopsis.

    A role is stored as the 4-tuple `(name, value, properties, optional)`.
    On construction the role name and property names are upper-cased, and
    the role value and property values are lower-cased.

    Args:
        name (str): the role name
        value (str): the role value (variable type or `"string"`)
        properties (dict): properties associated with the role's value
        optional (bool): a flag indicating if the role is optional
    Example:
        >>> role = SynopsisRole("ARG0", "x", {"PERS": "3"}, False)
    """

    name = property(itemgetter(0), doc="The role name.")
    value = property(
        itemgetter(1), doc='The role value (variable type or "string").'
    )
    properties = property(itemgetter(2), doc="Property-value map.")
    optional = property(itemgetter(3), doc="`True` if the role is optional.")

    def __new__(cls, name, value, properties=None, optional=False):
        if not properties:
            properties = {}
        else:
            # dict() accepts both mappings and iterables of pairs
            properties = {
                prop.upper(): val.lower()
                for prop, val in dict(properties).items()
            }
        return super().__new__(
            cls, ([name.upper(), value.lower(), properties, bool(optional)])
        )

    def __repr__(self) -> str:
        # The tuple holds a dict and a bool besides the two strings, so each
        # item must go through repr() before joining; joining the raw items
        # raises TypeError.
        return f"SynopsisRole({', '.join(map(repr, self))})"

    def _to_dict(self):
        """Return a dict of the role, omitting empty/false optional fields."""
        d = {"name": self.name, "value": self.value}
        if self.properties:
            d["properties"] = dict(self.properties)
        if self.optional:
            d["optional"] = True
        return d

    @classmethod
    def _from_dict(cls, d):
        """Create a role from a dict as produced by :meth:`_to_dict`."""
        return cls(
            d["name"],
            d["value"],
            d.get("properties", []),
            d.get("optional", False),
        )
class Synopsis(tuple):
    """
    A SEM-I predicate synopsis.

    A synopsis describes the roles of a predicate in a semantic
    structure, so it is no more than a tuple of roles as
    :class:`SynopsisRole` objects. The length of the synopsis is thus
    the arity of a predicate while the individual role items detail
    the role names, argument types, associated properties, and
    optionality.
    """

    def __repr__(self):
        return "Synopsis([{}])".format(", ".join(map(repr, self)))

    @classmethod
    def from_dict(cls, d):
        """
        Create a Synopsis from its dictionary representation.

        Example:
            >>> synopsis = Synopsis.from_dict(
            ...     {
            ...         "roles": [
            ...             {"name": "ARG0", "value": "e"},
            ...             {"name": "ARG1", "value": "x", "properties": {"NUM": "sg"}},
            ...         ]
            ...     }
            ... )
            >>> len(synopsis)
            2
        """
        return cls(SynopsisRole._from_dict(role) for role in d.get("roles", []))

    def to_dict(self):
        """
        Return a dictionary representation of the Synopsis.

        Example:
            >>> Synopsis(
            ...     [SynopsisRole("ARG0", "e"), SynopsisRole("ARG1", "x", {"NUM": "sg"})]
            ... ).to_dict()
            {'roles': [{'name': 'ARG0', 'value': 'e'}, {'name': 'ARG1', 'value': 'x', 'properties': {'NUM': 'sg'}}]}
        """
        return {"roles": [role._to_dict() for role in self]}

    def subsumes(self, args, variables=None):
        """
        Return `True` if the Synopsis subsumes *args*.

        The *args* argument is a description of MRS arguments. It may
        take two different forms:

        - a sequence (e.g., string or list) of variable types, e.g.,
          `"exh"`, which must be subsumed by the role values of the
          synopsis in order; synopsis roles beyond the end of the
          sequence must be optional
        - a mapping (e.g., a dict) of role names to variable types;
          every role name must exist in the synopsis (names are
          compared case-insensitively) and a variable type of `None`
          matches any role value

        In both cases *args* may not have more items than the synopsis
        has roles.

        The *variables* argument is a variable hierarchy providing a
        `subsumes(a, b)` method. If it is `None`, variable types are
        checked for strict equality.

        Raises:
            TypeError: if *args* is neither a sequence nor a mapping
        """
        if len(args) > len(self):
            return False  # some arg won't be in the synopsis

        # normalize input to (role name, variable type, synopsis role)
        if isinstance(args, Sequence):
            vartypes = (v.lower() if v else None for v in args)
            roleargs = list(zip_longest([], vartypes, self))
        elif isinstance(args, Mapping):
            # Normalize the keys *before* looking values up; otherwise a
            # lower-case key loses its value and matches any variable type.
            given = {name.upper(): vartype for name, vartype in args.items()}
            name_to_roles = {synrole.name: synrole for synrole in self}
            roleargs = []
            for role in set(given).union(name_to_roles):
                vartype = given.get(role)
                vartype = vartype.lower() if vartype else None
                roleargs.append((role, vartype, name_to_roles.get(role)))
        else:
            raise TypeError(args.__class__.__name__)

        # per-role checks
        for role, arg, synrole in roleargs:
            if synrole is None:
                return False  # unmatched role in args
            elif role is None and arg is None and not synrole.optional:
                # NOTE(review): `role` is only None in the sequence form, so
                # required roles absent from a mapping are not rejected
                # here; confirm whether that is the intended contract.
                return False  # unmatched synopsis role
            elif arg is not None:
                if variables is not None:
                    if not variables.subsumes(synrole.value, arg):
                        return False
                elif synrole.value != arg:
                    return False

        # all tests passed
        return True
class SemI:
    """
    A semantic interface.

    A SEM-I is the semantic inventory of a grammar: its variable types,
    the properties that may appear on variables, the roles available to
    predications, and the predicates together with their synopses.

    Args:
        variables: a mapping of (var, {'parents': [...], 'properties': [...]})
        properties: a mapping of (prop, {'parents': [...]})
        roles: a mapping of (role, {'value': ...})
        predicates: a mapping of (pred, {'parents': [...], 'synopses': [...]})
    Attributes:
        variables: hierarchy of variable types (created by
            ``_new_hierarchy()``); node data holds the property lists
        properties: hierarchy of property values
        roles: mapping of role names to allowed variable types
        predicates: hierarchy of predicates; node data holds the lists
            of synopses
    """

    def __init__(self, variables=None, properties=None, roles=None, predicates=None):
        self.properties = _new_hierarchy()
        self.variables = _new_hierarchy()
        self.roles = {}
        self.predicates = _new_hierarchy()
        # Each stage validates against the ones before it, so the order
        # properties -> variables -> roles -> predicates matters.
        stages = (
            (properties, self._init_properties),
            (variables, self._init_variables),
            (roles, self._init_roles),
            (predicates, self._init_predicates),
        )
        for section, initialize in stages:
            if section:
                initialize(section)

    def _init_properties(self, properties):
        """Add *properties* to the property hierarchy."""
        parents_of = {}
        for prop, prop_data in properties.items():
            parents_of[prop] = prop_data.get("parents") or TOP_TYPE
        self.properties.update(subhierarchy=parents_of)

    def _init_variables(self, variables):
        """Validate variable properties and add *variables* to the hierarchy."""
        parents_of = {}
        props_of = {}
        for var, var_data in variables.items():
            normalized = []
            for prop_name, prop_value in var_data.get("properties", []):
                prop_name, prop_value = prop_name.upper(), prop_value.lower()
                if prop_value not in self.properties:
                    raise SemIError(f"undefined property value: {prop_value}")
                normalized.append((prop_name, prop_value))
            parents_of[var] = var_data.get("parents") or TOP_TYPE
            props_of[var] = normalized
        self.variables.update(subhierarchy=parents_of, data=props_of)

    def _init_roles(self, roles):
        """Validate role values and record them in ``self.roles``."""
        for role_name, role_data in roles.items():
            role_name = role_name.upper()
            vartype = role_data["value"].lower()
            if vartype != STRING_TYPE and vartype not in self.variables:
                raise SemIError(f"undefined variable type: {vartype}")
            self.roles[role_name] = vartype

    def _init_predicates(self, predicates):
        """Validate synopses and add *predicates* to the hierarchy."""
        parents_of = {}
        synopses_of = {}
        # property name -> value for each variable type, built once up front
        propcache = {
            var: dict(props or []) for var, props in self.variables.items()
        }
        for pred, pred_data in predicates.items():
            synopses_of_pred = [
                self._init_synopsis(pred, synopsis_data, propcache)
                for synopsis_data in pred_data.get("synopses", [])
            ]
            parents_of[pred] = pred_data.get("parents") or TOP_TYPE
            synopses_of[pred] = synopses_of_pred
        self.predicates.update(subhierarchy=parents_of, data=synopses_of)

    def _init_synopsis(self, pred, synopsis_data, propcache):
        """Build a Synopsis for *pred* and check it against this SEM-I."""
        synopsis = Synopsis.from_dict(synopsis_data)
        for role in synopsis:
            if role.name not in self.roles:
                raise SemIError(f"{pred}: undefined role: {role.name}")

            if role.value == STRING_TYPE:
                if role.properties:
                    raise SemIError(f"{pred}: strings cannot define properties")
                continue

            if role.value not in self.variables:
                raise SemIError(f"{pred}: undefined variable type: {role.value}")

            for prop_name, prop_value in role.properties.items():
                if prop_value not in self.properties:
                    raise SemIError(
                        f"{pred}: undefined property value: {prop_value}"
                    )
                if prop_name in propcache[role.value]:
                    declared = propcache[role.value][prop_name]
                    if not self.properties.compatible(prop_value, declared):
                        raise SemIError(
                            f"{pred}: incompatible property values: "
                            f"{prop_value}, {declared}"
                        )
                else:
                    # Just warn because of the current situation where
                    # 'i' variables are used for unexpressed 'x's
                    warnings.warn(
                        f"{pred}: property '{prop_name}' not allowed on '{role.value}'",
                        SemIWarning,
                        stacklevel=2,
                    )
        return synopsis

    @classmethod
    def from_dict(cls, d):
        """Instantiate a SemI from a dictionary representation."""
        return cls(**d)

    def to_dict(self):
        """Return a dictionary representation of the SemI."""

        def new_entry(parents):
            # Parents are only written out when they say more than "*top*".
            entry = {}
            if parents and list(parents) != [TOP_TYPE]:
                entry["parents"] = list(parents)
            return entry

        variables = {}
        for var, var_props in self.variables.items():
            variables[var] = entry = new_entry(self.variables.parents(var))
            if var_props:
                entry["properties"] = [list(pair) for pair in var_props]

        properties = {}
        for prop in self.properties:
            properties[prop] = new_entry(self.properties.parents(prop))

        roles = {}
        for role_name, vartype in self.roles.items():
            roles[role_name] = {"value": vartype}

        predicates = {}
        for pred, synopses in self.predicates.items():
            predicates[pred] = entry = new_entry(self.predicates.parents(pred))
            if synopses:
                entry["synopses"] = [synopsis.to_dict() for synopsis in synopses]

        return {
            "variables": variables,
            "properties": properties,
            "roles": roles,
            "predicates": predicates,
        }

    def find_synopsis(self, predicate, args=None):
        """
        Return the first matching synopsis for *predicate*.

        *predicate* is normalized before lookup. When *args* is given,
        each synopsis of the predicate is tested in order with
        :meth:`Synopsis.subsumes` (using this SEM-I's variable
        hierarchy) and the first one that passes is returned. When
        *args* is empty or `None`, the first synopsis is returned.

        Args:
            predicate: predicate symbol whose synopsis will be returned
            args: description of arguments that must be subsumable by
                the synopsis
        Returns:
            the matching :class:`Synopsis`
        Raises:
            :class:`SemIError`: if *predicate* is undefined or if no
                matching synopsis can be found
        Example:
            >>> synopsis = smi.find_synopsis("_write_v_to", args="eii")
        """
        predicate = normalize_predicate(predicate)
        if predicate not in self.predicates:
            raise SemIError(f"undefined predicate: {predicate}")

        for synopsis in self.predicates[predicate]:
            if not args or synopsis.subsumes(args, self.variables):
                return synopsis

        described_args = repr(args) if args else ""
        raise SemIError(
            "no valid synopsis for {}({})".format(predicate, described_args)
        )
def _new_hierarchy():
    """
    Create an empty hierarchy for one SEM-I section.

    The hierarchy is a :class:`delphin.hierarchy.MultiHierarchy` whose top
    is `TOP_TYPE` and which is given `str.lower` as its identifier
    normalizer.
    """
    return hierarchy.MultiHierarchy(
        TOP_TYPE,
        normalize_identifier=str.lower,
    )