Source code for delphin.predicate

"""
Semantic predicates.
"""

import re

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.exceptions import PyDelphinException



[docs]
class PredicateError(PyDelphinException):
    """
    Error signalling an invalid predicate or predicate operation.
    """



# allowed parts-of-speech (single-character codes)
# 'd' ('discourse') is discouraged and may be removed
_POS = set("nvajrscpqxud")

# Component patterns: a lemma or a sense is one or more characters that
# are neither whitespace nor underscores; a part-of-speech is exactly
# one of the codes in _POS, matched case-insensitively.
_lemma_re = re.compile(r"[^\s_]+")
# The codes are sorted so that the pattern text is identical on every
# run; iterating over the set directly gives an arbitrary order.
_pos_re = re.compile(
    r"[{}]".format("".join(sorted(_POS))), flags=re.IGNORECASE
)
_sense_re = re.compile(r"[^\s_]+")

# raw pattern strings, reused to compose the whole-predicate patterns
_LEM_PAT = _lemma_re.pattern
_POS_PAT = _pos_re.pattern
_SNS_PAT = _sense_re.pattern

# strict regular expression only allows fully-compliant predicate strings
# Group 1 captures a surface predicate and group 2 an abstract one, so
# the `lastindex` of a match tells which of the two forms was found.
# Both alternatives are anchored with \Z instead of $ because $ also
# matches just before a trailing newline, which would let a string
# such as "_dog_n_1\n" pass as fully compliant.
_strict_predicate_re = re.compile(
    rf"(_{_LEM_PAT}_{_POS_PAT}(?:_{_SNS_PAT})?)\Z"  # normalized surface predicate
    r"|([^\s_]\S*)\Z",  # abstract predicate
    re.IGNORECASE,
)

# robust regular expression allows some observed variations
# The lemma group is lazy, so trailing underscore-separated parts are
# tried as the pos and sense groups before being added to the lemma.
# NOTE: unlike the strict pattern, the final $ here still matches just
# before a trailing newline.
_robust_predicate_re = re.compile(
    r"_?"  # allow abstract predicates, too
    rf"(?P<lemma>{_LEM_PAT}(?:_{_LEM_PAT})*?)"  # match until last 1 or 2 parts
    rf"(?:_(?P<pos>{_POS_PAT}))?"  # pos is optional
    rf"(?:_(?P<sense>{_SNS_PAT}))?"  # sense is optional
    r"(?:_rel)?$",  # _rel is optional
    flags=re.IGNORECASE,
)


def _strip_predicate(s: str) -> str:
    """
    Return *s* without enclosing quotes or a trailing `_rel`.

    A pair of enclosing double quotes is dropped or, failing that, a
    single leading apostrophe. A final `_rel`, in any letter case, is
    then removed from what remains.
    """
    if s[:1] == '"' and s[-1:] == '"':
        s = s[1:-1]
    elif s[:1] == "'":
        s = s[1:]
    has_rel_suffix = s[-4:].lower() == "_rel"
    return s[:-4] if has_rel_suffix else s



[docs]
def split(s: str) -> tuple[str, str | None, str | None]:
    """
    Break predicate string *s* into its lemma, pos, and sense.

    The pattern matching here is deliberately more forgiving than
    that of the validation functions :func:`is_valid`,
    :func:`is_surface`, and :func:`is_abstract`, so that inputs which
    are not entirely well-formed can still be handled, for instance
    surface predicates whose lemma contains underscores or which lack
    a part-of-speech. With some discretion it can also be used to
    inspect abstract predicates: these technically have no individual
    components, but in practice they follow the same convention as
    surface predicates.

    Returns:
        A `(lemma, pos, sense)` tuple in which pos and sense are
        `None` when absent.

    Raises:
        PredicateError: if *s* cannot be matched as a predicate.

    Examples:
        >>> split("_dog_n_1_rel")
        ('dog', 'n', '1')
        >>> split("udef_q")
        ('udef', 'q', None)
    """
    parsed = _robust_predicate_re.match(_strip_predicate(s))
    if parsed is not None:
        return parsed.group("lemma", "pos", "sense")
    # the error message reports the string exactly as the caller gave it
    raise PredicateError(f"invalid predicate: {s}")




[docs]
def create(lemma: str, pos: str, sense: str | None = None) -> str:
    """
    Build a surface predicate string from *lemma*, *pos*, and *sense*.

    Each component is validated, in that order, so the returned
    predicate symbol is guaranteed to be well-formed. The
    part-of-speech is checked without regard to letter case but is
    used in the result as given.

    Abstract predicate symbols cannot be created with this function.

    Raises:
        PredicateError: if *lemma*, *pos*, or a given *sense* is
            invalid.

    Examples:
        >>> create("dog", "n", "1")
        '_dog_n_1'
        >>> create("some", "q")
        '_some_q'
    """
    if not _lemma_re.fullmatch(lemma):
        raise PredicateError(f"invalid lemma: {lemma}")
    if pos.lower() not in _POS:
        raise PredicateError(f"invalid part-of-speech: {pos}")
    if sense is None:
        # the empty first item yields the leading underscore
        return "_".join(("", lemma, pos))
    if not _sense_re.fullmatch(sense):
        raise PredicateError(f"invalid sense: {sense}")
    return "_".join(("", lemma, pos, sense))




[docs]
def normalize(s: str) -> str:
    """
    Return the predicate string *s* in a conventional form.

    Quotes and the `_rel` suffix are stripped and the result is
    lowercased, which makes predicate strings more consistent.

    Examples:
        >>> normalize('"_DOG_n_1_rel"')
        '_dog_n_1'
        >>> normalize("_dog_n_1")
        '_dog_n_1'
    """
    return _strip_predicate(s).lower()




[docs]
def is_valid(s: str) -> bool:
    """
    Tell whether *s* is a valid predicate string.

    Quotes and a `_rel` suffix are disregarded; what remains must
    match the strict pattern for either a surface predicate or an
    abstract predicate.

    Examples:
        >>> is_valid('"_dog_n_1_rel"')
        True
        >>> is_valid("_dog_n_1")
        True
        >>> is_valid("_dog_noun_1")
        False
        >>> is_valid("dog_noun_1")
        True
    """
    stripped = _strip_predicate(s)
    return bool(_strict_predicate_re.match(stripped))




[docs]
def is_surface(s: str) -> bool:
    """
    Tell whether *s* is a valid surface predicate string.

    Quotes and a `_rel` suffix are disregarded before the check.

    Examples:
        >>> is_surface('"_dog_n_1_rel"')
        True
        >>> is_surface("_dog_n_1")
        True
        >>> is_surface("_dog_noun_1")
        False
        >>> is_surface("dog_noun_1")
        False
    """
    found = _strict_predicate_re.match(_strip_predicate(s))
    if found is None:
        return False
    # group 1 of the strict pattern is the surface-predicate alternative
    return found.lastindex == 1




[docs]
def is_abstract(s: str) -> bool:
    """
    Tell whether *s* is a valid abstract predicate string.

    Quotes and a `_rel` suffix are disregarded before the check.

    Examples:
        >>> is_abstract("udef_q_rel")
        True
        >>> is_abstract('"coord"')
        True
        >>> is_abstract('"_dog_n_1_rel"')
        False
        >>> is_abstract("_dog_n_1")
        False
    """
    found = _strict_predicate_re.match(_strip_predicate(s))
    if found is None:
        return False
    # group 2 of the strict pattern is the abstract-predicate alternative
    return found.lastindex == 2