Source code for delphin.predicate

"""
Semantic predicates.
"""

import re

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.exceptions import PyDelphinException


class PredicateError(PyDelphinException):
    """Error raised for an invalid predicate or predicate operation."""
# Allowed parts-of-speech.
# 'd' ('discourse') is discouraged and may be removed.
_POS = set("nvajrscpqxud")

_lemma_re = re.compile(r"[^\s_]+")
# The characters are sorted so the pattern text is identical on every run;
# iterating a set of strings directly follows hash order, which can change
# between interpreter invocations.
_pos_re = re.compile(
    r"[{}]".format("".join(sorted(_POS))),
    flags=re.IGNORECASE,
)
_sense_re = re.compile(r"[^\s_]+")

_LEM_PAT = _lemma_re.pattern
_POS_PAT = _pos_re.pattern
_SNS_PAT = _sense_re.pattern

# Strict regular expression only allows fully-compliant predicate strings.
# Group 1 captures a surface predicate, group 2 an abstract predicate.
# The pattern is anchored with \Z rather than $ because $ also matches just
# before a trailing newline, which would let "_dog_n_1\n" pass as valid.
_strict_predicate_re = re.compile(
    rf"(_{_LEM_PAT}_{_POS_PAT}(?:_{_SNS_PAT})?)\Z"  # normalized surface predicate
    r"|([^\s_]\S*)\Z",  # abstract predicate
    re.IGNORECASE,
)

# Robust regular expression allows some observed variations.
_robust_predicate_re = re.compile(
    r"_?"  # allow abstract predicates, too
    rf"(?P<lemma>{_LEM_PAT}(?:_{_LEM_PAT})*?)"  # match until last 1 or 2 parts
    rf"(?:_(?P<pos>{_POS_PAT}))?"  # pos is optional
    rf"(?:_(?P<sense>{_SNS_PAT}))?"  # sense is optional
    r"(?:_rel)?$",  # _rel is optional
    flags=re.IGNORECASE,
)


def _strip_predicate(s: str) -> str:
    """
    Remove quotes and the `_rel` suffix from predicate *s*.

    A pair of enclosing double quotes is removed, or else a single
    leading single-quote. After that, a trailing `_rel` (compared
    case-insensitively) is dropped.

    Args:
        s: the predicate string to clean up
    Returns:
        The predicate string without quoting or `_rel` suffix.
    """
    if s.startswith('"') and s.endswith('"'):
        s = s[1:-1]
    elif s.startswith("'"):
        # only an opening single-quote is stripped; no closing quote
        # is looked for
        s = s[1:]
    if s[-4:].lower() == "_rel":
        s = s[:-4]
    return s
def split(s: str) -> tuple[str, str | None, str | None]:
    """
    Break predicate string *s* into its lemma, pos, and sense.

    The pattern used here is more forgiving than the one behind the
    validation functions :func:`is_valid`, :func:`is_surface`, and
    :func:`is_abstract`, so that inputs which are not entirely
    well-formed can still be inspected, e.g., surface predicates with
    underscores in the lemma or without a part-of-speech. With some
    discretion it can also be applied to abstract predicates, which
    technically have no individual components but in practice follow
    the same convention as surface predicates.

    Args:
        s: the predicate string; quotes and a `_rel` suffix are
            stripped before matching
    Returns:
        A `(lemma, pos, sense)` triple where *pos* and *sense* are
        `None` if they were not found.
    Raises:
        PredicateError: if *s* does not match the robust pattern

    Examples:
        >>> split("_dog_n_1_rel")
        ('dog', 'n', '1')
        >>> split("udef_q")
        ('udef', 'q', None)
    """
    found = _robust_predicate_re.match(_strip_predicate(s))
    if found is not None:
        return found.group("lemma", "pos", "sense")
    # report the string as it was given, not the stripped form
    raise PredicateError(f"invalid predicate: {s}")
def create(lemma: str, pos: str, sense: str | None = None) -> str:
    """
    Build a surface predicate string from *lemma*, *pos*, and *sense*.

    Each component is validated so that the resulting predicate
    symbol is well-formed. Abstract predicate symbols cannot be
    created with this function.

    Args:
        lemma: the lemma; it must fully match the lemma pattern
        pos: the part-of-speech; compared case-insensitively against
            the allowed set
        sense: the optional sense; if given, it must fully match the
            sense pattern
    Returns:
        The predicate string with a leading underscore and its
        components joined by underscores.
    Raises:
        PredicateError: if any component is invalid

    Examples:
        >>> create("dog", "n", "1")
        '_dog_n_1'
        >>> create("some", "q")
        '_some_q'
    """
    if not _lemma_re.fullmatch(lemma):
        raise PredicateError(f"invalid lemma: {lemma}")
    if pos.lower() not in _POS:
        raise PredicateError(f"invalid part-of-speech: {pos}")
    if sense is not None and not _sense_re.fullmatch(sense):
        raise PredicateError(f"invalid sense: {sense}")
    sense_suffix = f"_{sense}" if sense else ""
    return f"_{lemma}_{pos}{sense_suffix}"
def normalize(s: str) -> str:
    """
    Return predicate string *s* in a conventional form.

    Quotes and the `_rel` suffix are removed and the remaining string
    is lowercased, which makes predicate strings more consistent.

    Examples:
        >>> normalize('"_DOG_n_1_rel"')
        '_dog_n_1'
        >>> normalize("_dog_n_1")
        '_dog_n_1'
    """
    return _strip_predicate(s).lower()
def is_valid(s: str) -> bool:
    """
    Return `True` if *s* is a valid predicate string.

    Quotes and a `_rel` suffix are removed from *s* before it is
    checked against the strict predicate pattern, which accepts both
    surface and abstract predicates.

    Examples:
        >>> is_valid('"_dog_n_1_rel"')
        True
        >>> is_valid("_dog_n_1")
        True
        >>> is_valid("_dog_noun_1")
        False
        >>> is_valid("dog_noun_1")
        True
    """
    return bool(_strict_predicate_re.match(_strip_predicate(s)))
def is_surface(s: str) -> bool:
    """
    Return `True` if *s* is a valid surface predicate string.

    Quotes and a `_rel` suffix are removed from *s* before it is
    checked against the strict predicate pattern.

    Examples:
        >>> is_surface('"_dog_n_1_rel"')
        True
        >>> is_surface("_dog_n_1")
        True
        >>> is_surface("_dog_noun_1")
        False
        >>> is_surface("dog_noun_1")
        False
    """
    found = _strict_predicate_re.match(_strip_predicate(s))
    if found is None:
        return False
    # the first capturing group of the strict pattern is the surface form
    return found.lastindex == 1
def is_abstract(s: str) -> bool:
    """
    Return `True` if *s* is a valid abstract predicate string.

    Quotes and a `_rel` suffix are removed from *s* before it is
    checked against the strict predicate pattern.

    Examples:
        >>> is_abstract("udef_q_rel")
        True
        >>> is_abstract('"coord"')
        True
        >>> is_abstract('"_dog_n_1_rel"')
        False
        >>> is_abstract("_dog_n_1")
        False
    """
    found = _strict_predicate_re.match(_strip_predicate(s))
    if found is None:
        return False
    # the second capturing group of the strict pattern is the abstract form
    return found.lastindex == 2