Source code for delphin.tokens

"""
YY tokens and token lattices.
"""

import re
from typing import NamedTuple

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.lnk import Lnk


class _YYToken(NamedTuple):
    """
    Bare record holding the ten fields of a YY token.

    This class only declares the field names, order, and types; it
    defines no default values and performs no validation. The public
    :class:`YYToken` subclass supplies defaults and the encoding and
    decoding methods.
    """

    id: int  # token identifier
    start: int  # start vertex
    end: int  # end vertex
    lnk: Lnk  # <from:to> character span of the token
    paths: list[int]  # path membership
    form: str  # surface token
    surface: str | None  # original token; None when not given
    ipos: int  # relative index of sub-token to which lrules apply
    lrules: list[str]  # list of morphological rules applied, "null" if None
    pos: list[tuple[str, float]]  # pairs of (POS tag, probability)


class YYToken(_YYToken):
    """
    A tuple of token data in the YY format.

    Only `id`, `start`, `end`, and `form` must be supplied; every other
    field has a default. The sequence arguments (`paths`, `lrules`,
    `pos`) are copied into new lists when the token is created.

    Args:
        id: token identifier
        start: start vertex
        end: end vertex
        lnk: <from:to> charspan (optional; ``Lnk.default()`` is used
            when omitted)
        paths: path membership (defaults to ``(1,)``)
        form: surface token (required, although it may be passed by
            keyword)
        surface: original token (optional; only if `form` was modified)
        ipos: relative index of the sub-token to which `lrules` apply
            (defaults to ``0``)
        lrules: lexical rules applied (defaults to ``("null",)``)
        pos: pairs of (POS, prob)

    Raises:
        TypeError: if `form` is not given.
    """

    __slots__ = ()

    def __new__(
        cls,
        id,
        start,
        end,
        lnk=None,
        paths=(1,),
        form=None,
        surface=None,
        ipos=0,
        lrules=("null",),
        pos=(),
    ):
        # `form` has a default only so that it can follow the optional
        # `lnk` and `paths` parameters; it is still mandatory.
        if form is None:
            raise TypeError("Missing required keyword argument 'form'.")
        link = Lnk.default() if lnk is None else lnk
        values = (
            id,
            start,
            end,
            link,
            list(paths),
            form,
            surface,
            ipos,
            list(lrules),
            list(pos),
        )
        return super().__new__(cls, *values)

    def __str__(self):
        """Serialize the token in the parenthesized YY notation."""
        fields = [str(value) for value in (self.id, self.start, self.end)]
        # The link is only written when it is truthy.
        if self.lnk:
            fields.append(str(self.lnk))
        # An empty path list is written as the single path 1.
        path_ids = self.paths if self.paths else [1]
        fields.append(" ".join(str(path) for path in path_ids))
        # The form and the optional surface share one comma-separated slot.
        form_field = f'"{self.form}"'
        if self.surface is not None:
            form_field = f'{form_field} "{self.surface}"'
        fields.append(form_field)
        fields.append(str(self.ipos))
        fields.append(" ".join(f'"{rule}"' for rule in self.lrules))
        if self.pos:
            fields.append(
                " ".join(f'"{tag}" {prob:.4f}' for tag, prob in self.pos)
            )
        return "(" + ", ".join(fields) + ")"

    @classmethod
    def from_dict(cls, d):
        """
        Decode from a dictionary as from :meth:`to_dict`.

        The keys ``id``, ``start``, ``end``, and ``form`` are required.
        ``from``/``to`` (a character span), ``surface``, and the parallel
        lists ``tags``/``probabilities`` are optional. `paths`, `ipos`,
        and `lrules` are not read from the dictionary, so the
        constructor defaults are used for them.
        """
        token_id, start, end = d["id"], d["start"], d["end"]
        lnk = None
        if "from" in d:
            lnk = Lnk.charspan(d["from"], d["to"])
        form = d["form"]
        surface = d.get("surface")
        tags = d.get("tags", [])
        probabilities = d.get("probabilities", [])
        # strict=True: tags and probabilities must have equal lengths.
        pos = list(zip(tags, probabilities, strict=True))
        return cls(token_id, start, end, lnk, form=form, surface=surface, pos=pos)

    def to_dict(self):
        """
        Encode the token as a dictionary suitable for JSON serialization.

        ``id``, ``start``, ``end``, and ``form`` are always present.
        ``from``/``to``, ``surface``, and ``tags``/``probabilities`` are
        only added when the token has a truthy link, a surface, or POS
        data, respectively. `paths`, `ipos`, and `lrules` are not
        included.
        """
        encoded = dict(id=self.id, start=self.start, end=self.end, form=self.form)
        if self.lnk:
            # The link data is unpacked as a (from, to) pair.
            encoded["from"], encoded["to"] = self.lnk.data
        if self.surface is not None:
            encoded["surface"] = self.surface
        if self.pos:
            encoded["tags"] = [pair[0] for pair in self.pos]
            encoded["probabilities"] = [pair[1] for pair in self.pos]
        return encoded
# Pattern for a single token of a YY token lattice. The layout follows
# https://github.com/delph-in/docs/wiki/PetInput:
#   (id, start, end, [link,] path+, form [surface], ipos, lrule+[, {pos p}+])
# The fragments below are concatenated and the {placeholders} are then
# replaced by the shared sub-patterns given at the bottom.
_yy_re = re.compile(
    "".join(
        [
            r"\(\s*",
            r"(?P<id>{integer}){comma}",
            r"(?P<start>{integer}){comma}",
            r"(?P<end>{integer}){comma}",
            # optional <from:to> link
            r"(?:<(?P<lnkfrom>{integer}):(?P<lnkto>{integer})>{comma})?",
            # one or more path numbers
            r"(?P<paths>(?:{integer}\s*)+){comma}",
            # form, optionally followed by the original surface string
            r"(?P<form>{string})",
            r"(?:\s*(?P<surface>{string}))?",
            r"{comma}",
            r"(?P<ipos>{integer}){comma}",
            # one or more quoted lexical rule names
            r"(?P<lrules>(?:{string}\s*)+)",
            # optional sequence of quoted tag / probability pairs
            r"(?:{comma}(?P<pos>(?:{string}\s+{float}\s*)+))?",
            r"\s*\)",
        ]
    ).format_map(
        {
            "integer": r"-?\d+",
            "comma": r"\s*,\s*",
            # double-quoted string that may contain backslash escapes
            "string": r'"[^"\\]*(?:\\.[^"\\]*)*"',
            # number with a fractional part and/or an exponent
            "float": r"-?(0|[1-9]\d*)(\.\d+[eE][-+]?|\.|[eE][-+]?)\d+",
        }
    )
)
[docs] class YYTokenLattice: """ A lattice of YY Tokens. Args: tokens: a list of YYToken objects """ def __init__(self, tokens): self.tokens = tokens
[docs] @classmethod def from_string(cls, s): """ Decode from the YY token lattice format. """ def _qstrip(s): return s[1:-1] # remove assumed quote characters tokens = [] for match in _yy_re.finditer(s): d = match.groupdict() lnk, pos = None, [] if d["lnkfrom"] is not None: lnk = Lnk.charspan(d["lnkfrom"], d["lnkto"]) if d["pos"] is not None: ps = d["pos"].strip().split() pos = list( zip( map(_qstrip, ps[::2]), map(float, ps[1::2]), strict=True, ) ) tokens.append( YYToken( int(d["id"]), int(d["start"]), int(d["end"]), lnk, list(map(int, d["paths"].strip().split())), _qstrip(d["form"]), None if d["surface"] is None else _qstrip(d["surface"]), int(d["ipos"]), list(map(_qstrip, d["lrules"].strip().split())), pos, ) ) return cls(tokens)
[docs] @classmethod def from_list(cls, toks): """ Decode from a list as from :meth:`to_list`. """ return cls(list(map(YYToken.from_dict, toks)))
[docs] def to_list(self): """ Encode the token lattice as a list suitable for JSON serialization. """ return [t.to_dict() for t in self.tokens]
def __str__(self): return " ".join(map(str, self.tokens)) def __eq__(self, other): if not isinstance(other, YYTokenLattice): return NotImplemented if len(self.tokens) == len(other.tokens) and all( t1 == t2 for t1, t2 in zip(self.tokens, other.tokens, strict=True) ): return True return False