"""
Serialization for the Indexed MRS format.
"""
from pathlib import Path
from delphin import variable
from delphin.lnk import Lnk
from delphin.mrs import CONSTANT_ROLE, EP, MRS, HCons, ICons, MRSSyntaxError
from delphin.util import Lexer
CODEC_INFO = {
"representation": "mrs",
}
##############################################################################
##############################################################################
# Pickle-API methods
[docs]
def load(source, semi):
"""
Deserialize Indexed MRS from a file (handle or filename)
Args:
source (str, file): input filename or file object
semi (:class:`SemI`): the semantic interface for the grammar
that produced the MRS
Returns:
a list of MRS objects
"""
if hasattr(source, "read"):
ms = list(_decode(source, semi))
else:
source = Path(source).expanduser()
with source.open() as fh:
ms = list(_decode(fh, semi))
return ms
[docs]
def loads(s, semi, single=False, encoding="utf-8"):
"""
Deserialize Indexed MRS string representations
Args:
s (str): an Indexed MRS string
semi (:class:`SemI`): the semantic interface for the grammar
that produced the MRS
Returns:
a list of MRS objects
"""
ms = list(_decode(s.splitlines(), semi))
return ms
[docs]
def dump(
ms, destination, semi, properties=True, lnk=True, indent=False, encoding="utf-8"
):
"""
Serialize MRS objects to Indexed MRS and write to a file
Args:
ms: an iterator of MRS objects to serialize
destination: filename or file object where data will be written
semi (:class:`SemI`): the semantic interface for the grammar
that produced the MRS
properties: if `False`, suppress morphosemantic properties
lnk: if `False`, suppress surface alignments and strings
indent (bool, int): if `True` or an integer value, add
newlines and indentation
encoding (str): if *destination* is a filename, write to the
file with the given encoding; otherwise it is ignored
"""
text = dumps(ms, semi, properties=properties, lnk=lnk, indent=indent)
if hasattr(destination, "write"):
print(text, file=destination)
else:
destination = Path(destination).expanduser()
with destination.open("w", encoding=encoding) as fh:
print(text, file=fh)
[docs]
def dumps(ms, semi, properties=True, lnk=True, indent=False):
"""
Serialize MRS objects to an Indexed MRS representation
Args:
ms: an iterator of MRS objects to serialize
semi (:class:`SemI`): the semantic interface for the grammar
that produced the MRS
properties: if `False`, suppress variable properties
lnk: if `False`, suppress surface alignments and strings
indent (bool, int): if `True` or an integer value, add
newlines and indentation
Returns:
an Indexed MRS string representation of a corpus of MRS objects
"""
return _encode(ms, semi, properties, lnk, indent)
[docs]
def decode(s, semi):
"""
Deserialize a MRS object from an Indexed MRS string.
Args:
s (str): an Indexed MRS string
semi (:class:`SemI`): the semantic interface for the grammar
that produced the MRS
"""
lexer = _IndexedMRSLexer.lex(s.splitlines())
return _decode_indexed(lexer, semi)
[docs]
def encode(d, semi, properties=True, lnk=True, indent=False):
"""
Serialize a MRS object to an Indexed MRS string.
Args:
d: a MRS object
semi (:class:`SemI`): the semantic interface for the grammar
that produced the MRS
properties (bool): if `False`, suppress variable properties
lnk: if `False`, suppress surface alignments and strings
indent (bool, int): if `True` or an integer value, add
newlines and indentation
Returns:
an Indexed MRS-serialization of the MRS object
"""
return _encode_indexed(d, semi, properties, lnk, indent)
##############################################################################
##############################################################################
# Decoding
_IndexedMRSLexer = Lexer(
tokens=[
(r"<-?\d+:-?\d+>", "LNK:a lnk value"),
(r'"([^"\\]*(?:\\.[^"\\]*)*)"', "DQSTRING:a string"),
(r"<", "LANGLE:<"),
(r">", "RANGLE:>"),
(r"\{", "LBRACE:{"),
(r"\}", "RBRACE:}"),
(r"\(", "LPAREN:("),
(r"\)", "RPAREN:)"),
(r",", "COMMA:,"),
(r":", "COLON::"),
(r'[^\s"\'()\/,:;<=>[\]{}]+', "SYMBOL:a symbol"),
(r"[^\s]", "UNEXPECTED"),
],
error_class=MRSSyntaxError,
)
LNK = _IndexedMRSLexer.tokentypes.LNK
DQSTRING = _IndexedMRSLexer.tokentypes.DQSTRING
LANGLE = _IndexedMRSLexer.tokentypes.LANGLE
RANGLE = _IndexedMRSLexer.tokentypes.RANGLE
LBRACE = _IndexedMRSLexer.tokentypes.LBRACE
RBRACE = _IndexedMRSLexer.tokentypes.RBRACE
LPAREN = _IndexedMRSLexer.tokentypes.LPAREN
RPAREN = _IndexedMRSLexer.tokentypes.RPAREN
COMMA = _IndexedMRSLexer.tokentypes.COMMA
COLON = _IndexedMRSLexer.tokentypes.COLON
SYMBOL = _IndexedMRSLexer.tokentypes.SYMBOL
def _decode(lineiter, semi):
lexer = _IndexedMRSLexer.lex(lineiter)
try:
while lexer.peek():
yield _decode_indexed(lexer, semi)
except StopIteration:
pass
def _decode_indexed(lexer, semi):
icons = lnk = surface = identifier = None
variables = {}
lexer.expect_type(LANGLE)
top, _, index = lexer.expect_type(SYMBOL, COMMA, SYMBOL)
if lexer.accept_type(COLON):
variables[index] = _decode_proplist(lexer)
lexer.expect_type(COMMA)
rels = _decode_rels(lexer, variables, semi)
hcons = _decode_cons(lexer, HCons)
if lexer.accept_type(COMMA):
icons = _decode_cons(lexer, ICons)
lexer.expect_type(RANGLE)
_match_properties(variables, semi)
return MRS(
top=top,
index=index,
rels=rels,
hcons=hcons,
icons=icons,
variables=variables,
lnk=lnk,
surface=surface,
identifier=identifier,
)
def _decode_proplist(lexer):
proplist = [lexer.expect_type(SYMBOL)]
while lexer.accept_type(COLON):
propval = lexer.expect_type(SYMBOL)
proplist.append(propval)
return proplist
def _decode_rels(lexer, variables, semi):
rels = []
lexer.expect_type(LBRACE)
if lexer.peek()[0] != RBRACE:
while True:
rels.append(_decode_rel(lexer, variables, semi))
if not lexer.accept_type(COMMA):
break
lexer.expect_type(RBRACE, COMMA)
return rels
def _decode_rel(lexer, variables, semi):
label, _, pred = lexer.expect_type(SYMBOL, COLON, SYMBOL)
lnk = _decode_lnk(lexer)
arglist, carg = _decode_arglist(lexer, variables)
argtypes = [variable.type(arg) for arg in arglist]
synopsis = semi.find_synopsis(pred, argtypes)
args = {d[0]: v for d, v in zip(synopsis, arglist, strict=True)}
if carg:
args[CONSTANT_ROLE] = carg
return EP(pred, label, args=args, lnk=lnk, surface=None, base=None)
def _decode_lnk(lexer):
lnk = lexer.accept_type(LNK)
if lnk is not None:
lnk = Lnk(lnk)
return lnk
def _decode_arglist(lexer, variables):
arglist = []
carg = None
lexer.expect_type(LPAREN)
if lexer.peek()[0] != RPAREN:
while True:
gid, arg = lexer.choice_type(SYMBOL, DQSTRING)
if gid == SYMBOL:
if lexer.accept_type(COLON):
variables[arg] = _decode_proplist(lexer)
arglist.append(arg)
else:
carg = arg
if not lexer.accept_type(COMMA):
break
lexer.expect_type(RPAREN)
return arglist, carg
def _decode_cons(lexer, cls):
cons = []
lexer.expect_type(LBRACE)
if lexer.peek()[0] != RBRACE:
while True:
lhs, reln, rhs = lexer.expect_type(SYMBOL, SYMBOL, SYMBOL)
cons.append(cls(lhs, reln, rhs))
if not lexer.accept_type(COMMA):
break
lexer.expect_type(RBRACE)
return cons
def _match_properties(variables, semi):
for var, propvals in variables.items():
if not propvals:
continue
semiprops = semi.variables[variable.type(var)]
assert len(semiprops) == len(propvals)
assert all(
semi.properties.subsumes(sp[1], pv)
for sp, pv in zip(semiprops, propvals, strict=True)
)
variables[var] = {sp[0]: pv for sp, pv in zip(semiprops, propvals, strict=True)}
##############################################################################
##############################################################################
# Encoding
def _encode(ms, semi, properties, lnk, indent):
if indent is None or indent is False:
delim = " "
else:
delim = "\n"
return delim.join(_encode_indexed(m, semi, properties, lnk, indent) for m in ms)
def _encode_indexed(m, semi, properties, lnk, indent):
if indent is None or indent is False:
i1 = ",{{{}}}"
i2 = i3 = ","
start = "<"
end = ">"
hook = "{},{}"
else:
if indent is True:
indent = 2
i1 = ",\n" + (" " * indent) + "{{" + (" " * (indent - 1)) + "{} }}"
i2 = ",\n" + (" " * indent)
i3 = ", "
start = "< "
end = " >"
hook = "{}, {}"
if properties:
varprops = _prepare_variable_properties(m, semi)
else:
varprops = {}
body = [
hook.format(m.top, _encode_variable(m.index, varprops)),
i1.format(i2.join(_encode_rel(ep, semi, varprops, lnk, i3) for ep in m.rels)),
i1.format(i2.join(_encode_hcons(hc) for hc in m.hcons)),
]
if m.icons:
body.append(i1.format(i2.join(_encode_icons(ic) for ic in m.icons)))
return start + "".join(body) + end
def _prepare_variable_properties(m, semi):
proplists = {}
for var, varprops in m.variables.items():
if varprops:
proplists[var] = [
varprops.get(key, val).upper()
for key, val in semi.variables[variable.type(var)]
]
return proplists
def _encode_variable(var, varprops):
if var in varprops:
props = ":" + ":".join(varprops[var])
del varprops[var]
else:
props = ""
return var + props
def _encode_rel(ep, semi, varprops, lnk, delim):
roles = {role: None for role in ep.args if role != CONSTANT_ROLE}
synopsis = semi.find_synopsis(ep.predicate, roles)
args = [
_encode_variable(ep.args[d.name], varprops)
for d in synopsis
if d.name in ep.args
]
if ep.carg is not None:
args.append(f'"{ep.carg}"')
return "{label}:{pred}{lnk}({args})".format(
label=ep.label,
pred=ep.predicate,
lnk=str(ep.lnk) if lnk else "",
args=delim.join(args),
)
def _encode_hcons(hc):
return f"{hc.hi} {hc.relation} {hc.lo}"
def _encode_icons(ic):
return f"{ic.left} {ic.relation} {ic.right}"