"""
Semantic predicates.
"""
import re
from typing import Optional, Tuple
# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__ # noqa: F401
from delphin.exceptions import PyDelphinException
class PredicateError(PyDelphinException):
    """Raised for invalid predicates or failed predicate operations."""
# allowed parts-of-speech
# 'd' ('discourse') is discouraged and may be removed
_POS = set('nvajrscpqxud')
# a lemma is any non-empty run of characters containing neither
# whitespace nor underscores (underscores delimit predicate parts)
_lemma_re = re.compile(r'[^\s_]+')
# a part-of-speech is a single character drawn from _POS; the character
# class is built dynamically from _POS so the two always agree
_pos_re = re.compile(r'[{}]'.format(''.join(_POS)), flags=re.IGNORECASE)
# a sense, like a lemma, is a non-empty run without whitespace/underscores
_sense_re = re.compile(r'[^\s_]+')
# strict regular expression only allows fully-compliant predicate strings
# group 1 matches surface predicates, group 2 abstract predicates; the
# validation functions below distinguish them via Match.lastindex
_strict_predicate_re = re.compile(
    r'(_{0}_{1}(?:_{2})?)$'  # normalized surface predicate
    r'|([^\s_]\S*)$'  # abstract predicate
    .format(_lemma_re.pattern, _pos_re.pattern, _sense_re.pattern),
    re.IGNORECASE)
# robust regular expression allows some observed variations
# (underscores inside the lemma, missing pos/sense, a trailing _rel)
_robust_predicate_re = re.compile(
    r'_?'  # allow abstract predicates, too
    r'(?P<lemma>{0}(?:_{0})*?)'  # non-greedy: match until last 1 or 2 parts
    r'(?:_(?P<pos>{1}))?'  # pos is optional
    r'(?:_(?P<sense>{2}))?'  # sense is optional
    r'(?:_rel)?$'  # _rel is optional
    .format(_lemma_re.pattern, _pos_re.pattern, _sense_re.pattern),
    flags=re.IGNORECASE)
def _strip_predicate(s: str) -> str:
"""Remove quotes and _rel suffix from predicate *s*"""
if s.startswith('"') and s.endswith('"'):
s = s[1:-1]
elif s.startswith("'"):
s = s[1:]
if s[-4:].lower() == '_rel':
s = s[:-4]
return s
def split(s: str) -> Tuple[str, Optional[str], Optional[str]]:
    """
    Split predicate string *s* and return the lemma, pos, and sense.

    Matching here is deliberately more permissive than that used by the
    validation functions :func:`is_valid`, :func:`is_surface`, and
    :func:`is_abstract`, so inputs that are not entirely well-formed —
    surface predicates with underscores in the lemma, or with a missing
    part-of-speech — can still be decomposed. It may also be applied,
    with some discretion, to abstract predicates, which officially have
    no internal components but in practice follow the same convention
    as surface predicates.

    Examples:
        >>> split('_dog_n_1_rel')
        ('dog', 'n', '1')
        >>> split('udef_q')
        ('udef', 'q', None)
    """
    stripped = _strip_predicate(s)
    m = _robust_predicate_re.match(stripped)
    if m is None:
        raise PredicateError(f'invalid predicate: {s}')
    # group() with several names returns them as a tuple; pos and
    # sense are None when their optional groups did not participate
    return m.group('lemma', 'pos', 'sense')
def create(lemma: str, pos: str, sense: Optional[str] = None) -> str:
    """
    Create a surface predicate string from its *lemma*, *pos*, and *sense*.

    Each component is validated so the returned predicate symbol is
    guaranteed to be well-formed. Abstract predicate symbols cannot be
    built with this function.

    Examples:
        >>> create('dog', 'n', '1')
        '_dog_n_1'
        >>> create('some', 'q')
        '_some_q'

    Raises:
        PredicateError: if any component is invalid
    """
    # a falsy result from fullmatch() means the component is malformed
    if not _lemma_re.fullmatch(lemma):
        raise PredicateError(f'invalid lemma: {lemma}')
    if pos.lower() not in _POS:
        raise PredicateError(f'invalid part-of-speech: {pos}')
    if sense is not None and not _sense_re.fullmatch(sense):
        raise PredicateError(f'invalid sense: {sense}')
    if sense:
        return f'_{lemma}_{pos}_{sense}'
    return f'_{lemma}_{pos}'
def normalize(s: str) -> str:
    """
    Normalize the predicate string *s* to a conventional form.

    Quotes and any `_rel` suffix are removed and the remainder is
    lowercased, making predicate strings more consistent.

    Examples:
        >>> normalize('"_DOG_n_1_rel"')
        '_dog_n_1'
        >>> normalize('_dog_n_1')
        '_dog_n_1'
    """
    return _strip_predicate(s).lower()
def is_valid(s: str) -> bool:
    """
    Return `True` if *s* is a valid predicate string.

    Both surface and abstract predicates are accepted; quotes and a
    `_rel` suffix are tolerated and stripped before checking.

    Examples:
        >>> is_valid('"_dog_n_1_rel"')
        True
        >>> is_valid('_dog_n_1')
        True
        >>> is_valid('_dog_noun_1')
        False
        >>> is_valid('dog_noun_1')
        True
    """
    # a Match object is truthy; None (no match) is falsy
    return bool(_strict_predicate_re.match(_strip_predicate(s)))
def is_surface(s: str) -> bool:
    """
    Return `True` if *s* is a valid surface predicate string.

    Examples:
        >>> is_surface('"_dog_n_1_rel"')
        True
        >>> is_surface('_dog_n_1')
        True
        >>> is_surface('_dog_noun_1')
        False
        >>> is_surface('dog_noun_1')
        False
    """
    match = _strict_predicate_re.match(_strip_predicate(s))
    if match is None:
        return False
    # group 1 of the strict pattern captures the surface alternative
    return match.lastindex == 1
def is_abstract(s: str) -> bool:
    """
    Return `True` if *s* is a valid abstract predicate string.

    Examples:
        >>> is_abstract('udef_q_rel')
        True
        >>> is_abstract('"coord"')
        True
        >>> is_abstract('"_dog_n_1_rel"')
        False
        >>> is_abstract('_dog_n_1')
        False
    """
    match = _strict_predicate_re.match(_strip_predicate(s))
    if match is None:
        return False
    # group 2 of the strict pattern captures the abstract alternative
    return match.lastindex == 2