# Source code for delphin.predicate
"""
Semantic predicates.
"""
import re
# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__ # noqa: F401
from delphin.exceptions import PyDelphinException
[docs]
class PredicateError(PyDelphinException):
    """
    Error raised for an invalid predicate or predicate operation.

    This is a :class:`PyDelphinException` subclass that adds no
    behavior of its own; it exists so that predicate-related failures
    can be caught separately from other errors.
    """
# allowed parts-of-speech
# 'd' ('discourse') is discouraged and may be removed
_POS = set("nvajrscpqxud")

# Patterns for the individual components of a surface predicate.  Lemmas
# and senses are runs of characters that are neither whitespace nor
# underscores; a part-of-speech is a single (case-insensitive) letter.
_lemma_re = re.compile(r"[^\s_]+")
# sorted() keeps the pattern text identical from run to run; iterating a
# set of strings directly yields an arbitrary order
_pos_re = re.compile(
    r"[{}]".format("".join(sorted(_POS))), flags=re.IGNORECASE
)
_sense_re = re.compile(r"[^\s_]+")

# Raw pattern strings, reused to assemble the whole-predicate patterns below
_LEM_PAT = _lemma_re.pattern
_POS_PAT = _pos_re.pattern
_SNS_PAT = _sense_re.pattern

# strict regular expression only allows fully-compliant predicate strings
#
# Group 1 captures a surface predicate and group 2 an abstract predicate,
# so Match.lastindex tells which alternative matched.  The alternatives
# are anchored with \Z rather than $ because $ also matches just before a
# trailing newline, which would let strings such as "_dog_n_1\n" pass
# validation even though no component may contain whitespace.
_strict_predicate_re = re.compile(
    rf"(_{_LEM_PAT}_{_POS_PAT}(?:_{_SNS_PAT})?)\Z"  # normalized surface predicate
    r"|([^\s_]\S*)\Z",  # abstract predicate
    re.IGNORECASE,
)

# robust regular expression allows some observed variations
#
# This pattern is intentionally lenient; its $ anchor (which tolerates a
# single trailing newline) is left as-is.
_robust_predicate_re = re.compile(
    r"_?"  # allow abstract predicates, too
    rf"(?P<lemma>{_LEM_PAT}(?:_{_LEM_PAT})*?)"  # match until last 1 or 2 parts
    rf"(?:_(?P<pos>{_POS_PAT}))?"  # pos is optional
    rf"(?:_(?P<sense>{_SNS_PAT}))?"  # sense is optional
    r"(?:_rel)?$",  # _rel is optional
    flags=re.IGNORECASE,
)
def _strip_predicate(s: str) -> str:
    """
    Return *s* without its quoting and without a trailing ``_rel``.

    A string wrapped in double quotes loses both quotes; otherwise a
    single leading apostrophe is dropped. After that, a final ``_rel``
    (compared case-insensitively) is removed if present.
    """
    if s.startswith('"') and s.endswith('"'):
        bare = s[1:-1]
    elif s.startswith("'"):
        bare = s[1:]
    else:
        bare = s
    # only the last four characters are inspected for the suffix
    has_rel_suffix = bare[-4:].lower() == "_rel"
    return bare[:-4] if has_rel_suffix else bare
[docs]
def split(s: str) -> tuple[str, str | None, str | None]:
    """
    Break predicate string *s* into its lemma, pos, and sense.

    Quotes and a ``_rel`` suffix are stripped from *s* before it is
    matched. The matching is more lenient than that of the validation
    functions :func:`is_valid`, :func:`is_surface`, and
    :func:`is_abstract`, so that inputs which are not entirely
    well-formed can still be split, such as surface predicates with
    underscores in the lemma or without a part-of-speech. With some
    discretion it may also be used to inspect abstract predicates,
    which technically do not have individual components but in
    practice follow the same convention as surface predicates.

    The pos and sense are `None` when they are absent from *s*.

    Raises:
        PredicateError: if *s* cannot be matched at all.

    Examples:
        >>> split("_dog_n_1_rel")
        ('dog', 'n', '1')
        >>> split("udef_q")
        ('udef', 'q', None)
    """
    found = _robust_predicate_re.match(_strip_predicate(s))
    if found is not None:
        # group() with several names returns the values as one tuple
        return found.group("lemma", "pos", "sense")
    raise PredicateError(f"invalid predicate: {s}")
[docs]
def create(lemma: str, pos: str, sense: str | None = None) -> str:
    """
    Build a surface predicate string from *lemma*, *pos*, and *sense*.

    Each component is validated so that the resulting predicate symbol
    is guaranteed to be well-formed. Abstract predicate symbols cannot
    be created with this function.

    Raises:
        PredicateError: if *lemma*, *pos*, or a given *sense* is
            invalid.

    Examples:
        >>> create("dog", "n", "1")
        '_dog_n_1'
        >>> create("some", "q")
        '_some_q'
    """
    if not _lemma_re.fullmatch(lemma):
        raise PredicateError(f"invalid lemma: {lemma}")
    if pos.lower() not in _POS:
        raise PredicateError(f"invalid part-of-speech: {pos}")
    # the leading empty string yields the initial underscore when joined
    if sense is None:
        return "_".join(("", lemma, pos))
    if not _sense_re.fullmatch(sense):
        raise PredicateError(f"invalid sense: {sense}")
    return "_".join(("", lemma, pos, sense))
[docs]
def normalize(s: str) -> str:
    """
    Return predicate string *s* in a conventional form.

    Quotes and the `_rel` suffix are removed and the remainder is
    lowercased, which makes predicate strings more consistent.

    Examples:
        >>> normalize('"_DOG_n_1_rel"')
        '_dog_n_1'
        >>> normalize("_dog_n_1")
        '_dog_n_1'
    """
    return _strip_predicate(s).lower()
[docs]
def is_valid(s: str) -> bool:
    """
    Return `True` if *s* is a valid predicate string.

    Quotes and a `_rel` suffix are ignored; what remains must match
    the strict pattern as either a surface or an abstract predicate.

    Examples:
        >>> is_valid('"_dog_n_1_rel"')
        True
        >>> is_valid("_dog_n_1")
        True
        >>> is_valid("_dog_noun_1")
        False
        >>> is_valid("dog_noun_1")
        True
    """
    # a match object is always truthy, so bool() is True exactly on a match
    return bool(_strict_predicate_re.match(_strip_predicate(s)))
[docs]
def is_surface(s: str) -> bool:
    """
    Return `True` if *s* is a valid surface predicate string.

    Quotes and a `_rel` suffix are ignored before *s* is checked.

    Examples:
        >>> is_surface('"_dog_n_1_rel"')
        True
        >>> is_surface("_dog_n_1")
        True
        >>> is_surface("_dog_noun_1")
        False
        >>> is_surface("dog_noun_1")
        False
    """
    found = _strict_predicate_re.match(_strip_predicate(s))
    if found is None:
        return False
    # capture group 1 of the strict pattern is the surface alternative
    return found.lastindex == 1
[docs]
def is_abstract(s: str) -> bool:
    """
    Return `True` if *s* is a valid abstract predicate string.

    Quotes and a `_rel` suffix are ignored before *s* is checked.

    Examples:
        >>> is_abstract("udef_q_rel")
        True
        >>> is_abstract('"coord"')
        True
        >>> is_abstract('"_dog_n_1_rel"')
        False
        >>> is_abstract("_dog_n_1")
        False
    """
    found = _strict_predicate_re.match(_strip_predicate(s))
    if found is None:
        return False
    # capture group 2 of the strict pattern is the abstract alternative
    return found.lastindex == 2