Source code for delphin.tdl._model

from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import NamedTuple, TypeAlias, Union

from delphin import util
from delphin.tdl._exceptions import TDLError
from delphin.tfs import FeatureStructure

# Values for list expansion
LIST_TYPE = "*list*"  #: type of lists in TDL
EMPTY_LIST_TYPE = "*null*"  #: type of list terminators
LIST_HEAD = "FIRST"  #: feature for list items
LIST_TAIL = "REST"  #: feature for list tails
DIFF_LIST_LIST = "LIST"  #: feature for diff-list lists
DIFF_LIST_LAST = "LAST"  #: feature for the last path in a diff-list

AttrSeq: TypeAlias = Sequence[tuple[str, Union["Conjunction", "Term"]]]
AttrMap: TypeAlias = Mapping[str, Union["Conjunction", "Term"]]


# Classes for TDL entities


[docs] class Term: """ Base class for the terms of a TDL conjunction. All terms are defined to handle the binary '&' operator, which puts both into a Conjunction: >>> TypeIdentifier("a") & TypeIdentifier("b") <Conjunction object at 140008950372168> Args: docstring (str): documentation string Attributes: docstring (str): documentation string """ def __init__(self, docstring=None): self.docstring = docstring def __repr__(self): return f"<{type(self).__name__} object at {id(self)}>" def __and__(self, other): if isinstance(other, Term): return Conjunction([self, other]) elif isinstance(other, Conjunction): return Conjunction([self, *other._terms]) else: return NotImplemented
class TypeTerm(Term, str): """ Base class for type terms (identifiers, strings and regexes). This subclass of :class:`Term` also inherits from :py:class:`str` and forms the superclass of the string-based terms :class:`TypeIdentifier`, :class:`String`, and :class:`Regex`. Its purpose is to handle the correct instantiation of both the :class:`Term` and :py:class:`str` supertypes and to define equality comparisons such that different kinds of type terms with the same string value are not considered equal: >>> String("a") == String("a") True >>> String("a") == TypeIdentifier("a") False """ def __new__(cls, string, docstring=None): return str.__new__(cls, string) def __init__(self, string, docstring=None): super().__init__(docstring=docstring) def __repr__(self): return f"<{type(self).__name__} object ({self}) at {id(self)}>" def __eq__(self, other): if not isinstance(other, self.__class__): return NotImplemented return str.__eq__(self, other) def __ne__(self, other): if not isinstance(other, self.__class__): return NotImplemented return str.__ne__(self, other)
[docs] class TypeIdentifier(TypeTerm): """ Type identifiers, or type names. Unlike other :class:`TypeTerms <TypeTerm>`, TypeIdentifiers use case-insensitive comparisons: >>> TypeIdentifier("MY-TYPE") == TypeIdentifier("my-type") True Args: string (str): type name docstring (str): documentation string Attributes: docstring (str): documentation string """ def __eq__(self, other): if isinstance(other, TypeTerm) and not isinstance(other, TypeIdentifier): return NotImplemented return self.lower() == other.lower() def __ne__(self, other): if isinstance(other, TypeTerm) and not isinstance(other, TypeIdentifier): return NotImplemented return self.lower() != other.lower()
[docs] class String(TypeTerm): """ Double-quoted strings. Args: string (str): type name docstring (str): documentation string Attributes: docstring (str): documentation string """
[docs] class Regex(TypeTerm): """ Regular expression patterns. Args: string (str): type name docstring (str): documentation string Attributes: docstring (str): documentation string """
[docs] class AVM(FeatureStructure, Term): """ A feature structure as used in TDL. Args: featvals (list, dict): a sequence of `(attribute, value)` pairs or an attribute to value mapping docstring (str): documentation string Attributes: docstring (str): documentation string """ def __init__( self, featvals: AttrSeq | AttrMap | None = None, docstring=None, ) -> None: # super() doesn't work because I need to split the parameters FeatureStructure.__init__(self) Term.__init__(self, docstring=docstring) if featvals is not None: self.aggregate(featvals) @classmethod def _default(cls): return _ImplicitAVM() def __setitem__(self, key: str, val: Union["Conjunction", Term]) -> None: if not (val is None or isinstance(val, (Term, Conjunction))): raise TypeError(f"invalid attribute value type: {type(val).__name__}") super().__setitem__(key, val)
[docs] def aggregate(self, featvals: AttrSeq | AttrMap) -> None: """Combine features in a single AVM. This function takes feature paths and values and merges them into the AVM, but does not do full unification. For example: >>> avm = tdl.AVM([("FEAT", tdl.TypeIdentifier("val1"))]) >>> avm.aggregate( ... [ ... ("FEAT", tdl.TypeIdentifier("val2")), ... ("FEAT.SUB", tdl.TypeIdentifier("val3")), ... ] ... ) >>> print(tdl.format(avm)) [ FEAT val1 & val2 & [ SUB val3 ] ] The *featvals* argument may be an sequence of (feature, value) pairs or a mapping of features to values. """ if hasattr(featvals, "items"): featvals = list(featvals.items()) for feat, val in featvals: avm = self feat = feat.upper() while feat: subkey, _, rest = feat.partition(".") cur_val = avm.get(subkey) # new feature, just assign if subkey not in avm: avm[feat] = val break # last feature on path, conjoin elif not rest: avm[subkey] = cur_val & val # non-conjunction implicit AVM; follow the dots elif isinstance(cur_val, _ImplicitAVM): avm = cur_val # conjunction with implicit AVM; follow the AVM's dots elif ( isinstance(cur_val, Conjunction) and (avm_ := cur_val._last_avm()) and isinstance(avm_, _ImplicitAVM) ): avm = avm_ # some other term; create conjunction with implicit AVM else: avm_ = _ImplicitAVM() avm[subkey] = cur_val & avm_ avm = avm_ feat = rest
[docs] def normalize(self): """ Reduce trivial AVM conjunctions to just the AVM. For example, in `[ ATTR1 [ ATTR2 val ] ]` the value of `ATTR1` could be a conjunction with the sub-AVM `[ ATTR2 val ]`. This method removes the conjunction so the sub-AVM nests directly (equivalent to `[ ATTR1.ATTR2 val ]` in TDL). """ for attr in self._avm: val = self._avm[attr] if isinstance(val, Conjunction): val.normalize() if len(val.terms) == 1 and isinstance(val.terms[0], AVM): self._avm[attr] = val.terms[0] elif isinstance(val, AVM): val.normalize()
[docs] def features(self, expand=False): """ Return the list of tuples of feature paths and feature values. Args: expand (bool): if `True`, expand all feature paths Example: >>> avm = AVM([('A.B', TypeIdentifier('1')), ... ('A.C', TypeIdentifier('2')]) >>> avm.features() [('A', <AVM object at ...>)] >>> avm.features(expand=True) [('A.B', <TypeIdentifier object (1) at ...>), ('A.C', <TypeIdentifier object (2) at ...>)] """ fs = [] for featpath, val in super().features(expand=expand): # don't juse Conjunction.features() here because we want to # include the non-AVM terms, too if expand and isinstance(val, Conjunction): for term in val.terms: if isinstance(term, AVM): for fp, v in term.features(True): fs.append((f"{featpath}.{fp}", v)) else: fs.append((featpath, term)) else: fs.append((featpath, val)) return fs
class _ImplicitAVM(AVM): """AVM implicitly constructed by dot-notation and list syntax."""
[docs] class ConsList(AVM): """ AVM subclass for cons-lists (``< ... >``) This provides a more intuitive interface for creating and accessing the values of list structures in TDL. Some combinations of the *values* and *end* parameters correspond to various TDL forms as described in the table below: ============ ======== ================= ====== TDL form values end state ============ ======== ================= ====== `< >` `None` `EMPTY_LIST_TYPE` closed `< ... >` `None` `LIST_TYPE` open `< a >` `[a]` `EMPTY_LIST_TYPE` closed `< a, b >` `[a, b]` `EMPTY_LIST_TYPE` closed `< a, ... >` `[a]` `LIST_TYPE` open `< a . b >` `[a]` `b` closed ============ ======== ================= ====== Args: values (list): a sequence of :class:`Conjunction` or :class:`Term` objects to be placed in the AVM of the list. end (str, :class:`Conjunction`, :class:`Term`): last item in the list (default: :data:`LIST_TYPE`) which determines if the list is open or closed docstring (str): documentation string Attributes: terminated (bool): if `False`, the list can be further extended by following the :data:`LIST_TAIL` features. docstring (str): documentation string """ def __init__(self, values=None, end=LIST_TYPE, docstring=None): super().__init__(docstring=docstring) if values is None: values = [] self._last_path = "" self.terminated = False for value in values: self.append(value) self.terminate(end) def __len__(self): return len(self.values())
[docs] def values(self): """ Return the list of values in the ConsList feature structure. """ if self._avm is None: return [] else: return [val for _, val in _collect_list_items(self)]
[docs] def append(self, value): """ Append an item to the end of an open ConsList. Args: value (:class:`Conjunction`, :class:`Term`): item to add Raises: :class:`TDLError`: when appending to a closed list """ if self._avm is not None and not self.terminated: path = self._last_path if path: path += "." self[path + LIST_HEAD] = value self._last_path = path + LIST_TAIL self[self._last_path] = _ImplicitAVM() else: raise TDLError("Cannot append to a closed list.")
[docs] def terminate(self, end): """ Set the value of the tail of the list. Adding values via :meth:`append` places them on the `FIRST` feature of some level of the feature structure (e.g., `REST.FIRST`), while :meth:`terminate` places them on the final `REST` feature (e.g., `REST.REST`). If *end* is a :class:`Conjunction` or :class:`Term`, it is typically a :class:`Coreference`, otherwise *end* is set to `tdl.EMPTY_LIST_TYPE` or `tdl.LIST_TYPE`. This method does not necessarily close the list; if *end* is `tdl.LIST_TYPE`, the list is left open, otherwise it is closed. Args: end (str, :class:`Conjunction`, :class:`Term`): value to use as the end of the list. """ if self.terminated: raise TDLError("Cannot terminate a closed list.") if end == LIST_TYPE: self.terminated = False elif end == EMPTY_LIST_TYPE: if self._last_path: self[self._last_path] = None else: self._avm = None self.terminated = True elif self._last_path: self[self._last_path] = end self.terminated = True else: raise TDLError(f"Empty list must be {LIST_TYPE} or {EMPTY_LIST_TYPE}")
[docs] class DiffList(AVM): """ AVM subclass for diff-lists (``<! ... !>``) As with :class:`ConsList`, this provides a more intuitive interface for creating and accessing the values of list structures in TDL. Unlike :class:`ConsList`, DiffLists are always closed lists with the last item coreferenced with the `LAST` feature, which allows for the joining of two diff-lists. Args: values (list): a sequence of :class:`Conjunction` or :class:`Term` objects to be placed in the AVM of the list docstring (str): documentation string Attributes: last (str): the feature path to the list position coreferenced by the value of the :data:`DIFF_LIST_LAST` feature. docstring (str): documentation string """ def __init__(self, values=None, docstring=None): cr = Coreference(None) if values: # use ConsList to construct the list, but discard the class tmplist = ConsList(values, end=cr) dl_list = _ImplicitAVM() dl_list._avm.update(tmplist._avm) self.last = "LIST." + tmplist._last_path else: dl_list = cr self.last = "LIST" dl_last = cr featvals = [(DIFF_LIST_LIST, dl_list), (DIFF_LIST_LAST, dl_last)] super().__init__(featvals, docstring=docstring) def __len__(self): return len(self.values())
[docs] def values(self): """ Return the list of values in the DiffList feature structure. """ if isinstance(self[DIFF_LIST_LIST], Coreference): vals = [] else: vals = [val for _, val in _collect_list_items(self.get(DIFF_LIST_LIST))] vals.pop() # last item of diff list is coreference return vals
def _collect_list_items(d): if not isinstance(d, AVM) or d.get(LIST_HEAD) is None: return [] vals = [(LIST_HEAD, d[LIST_HEAD])] rest = d[LIST_TAIL] if isinstance(rest, _ImplicitAVM): vals.extend( (LIST_TAIL + "." + path, val) for path, val in _collect_list_items(rest) ) elif rest is not None: vals.append((LIST_TAIL, rest)) return vals
[docs] class Coreference(Term): """ TDL coreferences, which represent re-entrancies in AVMs. Args: identifier (str): identifier or tag associated with the coreference; for internal use (e.g., in :class:`DiffList` objects), the identifier may be `None` docstring (str): documentation string Attributes: identifier (str): corefernce identifier or tag docstring (str): documentation string """ def __init__(self, identifier, docstring=None): super().__init__(docstring=docstring) self.identifier = identifier def __str__(self): if self.identifier is not None: return str(self.identifier) return ""
[docs] class Conjunction: """ Conjunction of TDL terms. Args: terms (list): sequence of :class:`Term` objects """ def __init__(self, terms=None): self._terms = [] if terms is not None: for term in terms: self.add(term) def __repr__(self): return f"<Conjunction object at {id(self)}>" def __and__(self, other): if isinstance(other, Conjunction): return Conjunction(self._terms + other._terms) elif isinstance(other, Term): return Conjunction([*self._terms, other]) else: return NotImplemented def __eq__(self, other): if isinstance(other, Term) and len(self._terms) == 1: return self._terms[0] == other elif not isinstance(other, Conjunction): return NotImplemented return self._terms == other._terms def __contains__(self, key): return any(key in term for term in self._terms if isinstance(term, AVM)) def __getitem__(self, key): """Get the value of *key* across all AVMs in the conjunction""" terms = [] for term in self._terms: if isinstance(term, AVM): val = term.get(key) if val is not None: terms.append(val) if len(terms) == 0: raise KeyError(key) elif len(terms) == 1: return terms[0] else: return Conjunction(terms) def __setitem__(self, key, val): """Set *key* to *val* in the last AVM in the conjunction""" if avm := self._last_avm(): avm[key] = val else: raise TDLError("no AVM in Conjunction") def __delitem__(self, key): """Delete *key* from all AVMs in the conjunction""" found = False for term in self._terms: if isinstance(term, AVM) and key in term: found = True del term[key] if not found: raise KeyError(key)
[docs] def get(self, key, default=None): """ Get the value of attribute *key* in any AVM in the conjunction. Args: key: attribute path to search default: value to return if *key* is not defined on any AVM """ try: return self[key] except KeyError: return default
[docs] def normalize(self): """ Rearrange the conjunction to a conventional form. This puts any coreference(s) first, followed by type terms, then followed by AVM(s) (including lists). AVMs are normalized via :meth:`AVM.normalize`. """ corefs = [] types = [] avms = [] for term in self._terms: if isinstance(term, TypeTerm): types.append(term) elif isinstance(term, AVM): term.normalize() avms.append(term) elif isinstance(term, Coreference): corefs.append(term) else: raise TDLError(f"unexpected term {term}") self._terms = corefs + types + avms
@property def terms(self): """The list of terms in the conjunction.""" return list(self._terms)
[docs] def add(self, term): """ Add a term to the conjunction. Args: term (:class:`Term`, :class:`Conjunction`): term to add; if a :class:`Conjunction`, all of its terms are added to the current conjunction. Raises: :class:`TypeError`: when *term* is an invalid type """ if isinstance(term, Conjunction): for term_ in term.terms: self.add(term_) elif isinstance(term, Term): self._terms.append(term) else: raise TypeError("Not a Term or Conjunction")
[docs] def types(self): """Return the list of type terms in the conjunction.""" return [ term for term in self._terms if isinstance(term, (TypeIdentifier, String, Regex)) ]
[docs] def features(self, expand=False): """Return the list of feature-value pairs in the conjunction.""" featvals = [] for term in self._terms: if isinstance(term, AVM): featvals.extend(term.features(expand=expand)) return featvals
[docs] def string(self): """ Return the first string term in the conjunction, or `None`. """ for term in self._terms: if isinstance(term, String): return str(term) return None # conjunction does not have a string type (not an error)
def _last_avm(self) -> AVM | None: for term in reversed(self._terms): if isinstance(term, AVM): return term return None
[docs] class TypeDefinition: """ A top-level Conjunction with an identifier. Args: identifier (str): type name conjunction (:class:`Conjunction`, :class:`Term`): type constraints docstring (str): documentation string Attributes: identifier (str): type identifier conjunction (:class:`Conjunction`): type constraints docstring (str): documentation string """ _operator = ":=" def __init__(self, identifier, conjunction, docstring=None): self.identifier = identifier if isinstance(conjunction, Term): conjunction = Conjunction([conjunction]) assert isinstance(conjunction, Conjunction) self.conjunction = conjunction self.docstring = docstring def __repr__(self): return f"<{type(self).__name__} object '{self.identifier}' at {id(self)}>" @property def supertypes(self): """The list of supertypes for the type.""" return self.conjunction.types()
[docs] def features(self, expand=False): """Return the list of feature-value pairs in the conjunction.""" return self.conjunction.features(expand=expand)
def __contains__(self, key): return key in self.conjunction def __getitem__(self, key): return self.conjunction[key] def __setitem__(self, key, value): self.conjunction[key] = value def __delitem__(self, key): del self.conjunction[key]
[docs] def documentation(self, level="first"): """ Return the documentation of the type. By default, this is the first docstring on a top-level term. By setting *level* to `"top"`, the list of all docstrings on top-level terms is returned, including the type's `docstring` value, if not `None`, as the last item. The docstring for the type itself is available via :attr:`TypeDefinition.docstring`. Args: level (str): `"first"` or `"top"` Returns: a single docstring or a list of docstrings """ docs = ( t.docstring for t in [*list(self.conjunction.terms), self] if t.docstring is not None ) if level.lower() == "first": doc = next(docs, None) elif level.lower() == "top": doc = list(docs) return doc
[docs] class TypeAddendum(TypeDefinition): """ An addendum to an existing type definition. Type addenda, unlike :class:`type definitions <TypeDefinition>`, do not require supertypes, or even any feature constraints. An addendum, however, must have at least one supertype, AVM, or docstring. Args: identifier (str): type name conjunction (:class:`Conjunction`, :class:`Term`): type constraints docstring (str): documentation string Attributes: identifier (str): type identifier conjunction (:class:`Conjunction`): type constraints docstring (str): documentation string """ _operator = ":+" def __init__(self, identifier, conjunction=None, docstring=None): if conjunction is None: conjunction = Conjunction() super().__init__(identifier, conjunction, docstring)
[docs] class LexicalRuleDefinition(TypeDefinition): """ An inflecting lexical rule definition. Args: identifier (str): type name affix_type (str): `"prefix"` or `"suffix"` patterns (list): sequence of `(match, replacement)` pairs conjunction (:class:`Conjunction`, :class:`Term`): conjunction of constraints applied by the rule docstring (str): documentation string Attributes: identifier (str): type identifier affix_type (str): `"prefix"` or `"suffix"` patterns (list): sequence of `(match, replacement)` pairs conjunction (:class:`Conjunction`): type constraints docstring (str): documentation string """ def __init__(self, identifier, affix_type, patterns, conjunction, **kwargs): super().__init__(identifier, conjunction, **kwargs) self.affix_type = affix_type self.patterns = patterns
class _MorphSet: def __init__(self, var, characters): self.var = var self.characters = characters
[docs] class LetterSet(_MorphSet): """ A capturing character class for inflectional lexical rules. LetterSets define a pattern (e.g., `"!a"`) that may match any one of its associated characters. Unlike :class:`WildCard` patterns, LetterSet variables also appear in the replacement pattern of an affixing rule, where they insert the character matched by the corresponding letter set. Args: var (str): variable used in affixing rules (e.g., `"!a"`) characters (str): string or collection of characters that may match an input character Attributes: var (str): letter-set variable characters (str): characters included in the letter-set """ pass
[docs] class WildCard(_MorphSet): """ A non-capturing character class for inflectional lexical rules. WildCards define a pattern (e.g., `"?a"`) that may match any one of its associated characters. Unlike :class:`LetterSet` patterns, WildCard variables may not appear in the replacement pattern of an affixing rule. Args: var (str): variable used in affixing rules (e.g., `"!a"`) characters (str): string or collection of characters that may match an input character Attributes: var (str): wild-card variable characters (str): characters included in the wild-card """ pass
class _Environment: """ TDL environment. """ def __init__(self, entries=None): if entries is None: entries = [] self.entries = entries
[docs] class TypeEnvironment(_Environment): """ TDL type environment. Args: entries (list): TDL entries """
[docs] class InstanceEnvironment(_Environment): """ TDL instance environment. Args: status (str): status (e.g., `"lex-rule"`) entries (list): TDL entries """ def __init__(self, status, entries=None): super().__init__(entries) self.status = status
class ConfigEnvironment(_Environment): """ TDL configuration environment. Args: entries (list): config entries """ def __init__(self, label: str = "", entries=None): super().__init__(entries) self.label = label class ConfigEntry(NamedTuple): """Key-value pair from a TDL config file. Since the type of the value (e.g., an atomic string or a list) depends on the key, values are always read as a list. For convenience, the :prop:`value` property returns these values as a single string joined with space characters. """ key: str values: list[str] @property def value(self) -> str: return " ".join(self.values)
[docs] class FileInclude: """ Include other TDL files in the current environment. Args: value: quoted value of the TDL include statement basedir: directory containing the file with the include statement Attributes: value: The quoted value of TDL include statement. path: The path to the TDL file to include. """ def __init__(self, value: str = "", basedir: util.PathLike = "") -> None: self.value = value self.path = Path(basedir, value).with_suffix(".tdl")
[docs] class LineComment(str): """Single-line comments in TDL."""
[docs] class BlockComment(str): """Multi-line comments in TDL."""