# Source code for delphin.codecs.indexedmrs

"""
Serialization for the Indexed MRS format.
"""

from pathlib import Path

from delphin import variable
from delphin.lnk import Lnk
from delphin.mrs import CONSTANT_ROLE, EP, MRS, HCons, ICons, MRSSyntaxError
from delphin.util import Lexer

# Codec metadata: declares that this module serializes the "mrs"
# representation.
# NOTE(review): presumably read by the codec-discovery machinery -- confirm.
CODEC_INFO = {
    "representation": "mrs",
}


##############################################################################
##############################################################################
# Pickle-API methods


def load(source, semi, encoding="utf-8"):
    """
    Deserialize Indexed MRS from a file (handle or filename)

    Args:
        source (str, file): input filename or file object
        semi (:class:`SemI`): the semantic interface for the grammar
            that produced the MRS
        encoding (str): if *source* is a filename, read the file with
            the given encoding; otherwise it is ignored
    Returns:
        a list of MRS objects
    """
    if hasattr(source, "read"):
        # Already an open file-like object; the caller chose its encoding.
        return list(_decode(source, semi))

    path = Path(source).expanduser()
    # Read with an explicit encoding (defaulting to the same one dump()
    # writes with) instead of the platform default, so that files written
    # by dump() can be read back regardless of the locale.
    with path.open(encoding=encoding) as fh:
        # _decode() is a generator: exhaust it while the file is still open.
        return list(_decode(fh, semi))
def loads(s, semi, single=False, encoding="utf-8"):
    """
    Deserialize Indexed MRS string representations

    Args:
        s (str): an Indexed MRS string
        semi (:class:`SemI`): the semantic interface for the grammar
            that produced the MRS
        single: accepted but not used by this function
        encoding: accepted but not used by this function
    Returns:
        a list of MRS objects
    """
    # The decoder consumes an iterable of lines, so split the string first.
    lines = s.splitlines()
    return list(_decode(lines, semi))
def dump(
    ms, destination, semi, properties=True, lnk=True, indent=False, encoding="utf-8"
):
    """
    Serialize MRS objects to Indexed MRS and write to a file

    Args:
        ms: an iterator of MRS objects to serialize
        destination: filename or file object where data will be written
        semi (:class:`SemI`): the semantic interface for the grammar
            that produced the MRS
        properties: if `False`, suppress morphosemantic properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
        encoding (str): if *destination* is a filename, write to the
            file with the given encoding; otherwise it is ignored
    """
    # Serialize everything up front, before any file is opened.
    serialized = dumps(ms, semi, properties=properties, lnk=lnk, indent=indent)

    if not hasattr(destination, "write"):
        # A path was given: open (and close) the file ourselves.
        path = Path(destination).expanduser()
        with path.open("w", encoding=encoding) as fh:
            print(serialized, file=fh)
        return

    # A writable file-like object was given: write to it and leave it open.
    print(serialized, file=destination)
def dumps(ms, semi, properties=True, lnk=True, indent=False):
    """
    Serialize MRS objects to an Indexed MRS representation

    Args:
        ms: an iterator of MRS objects to serialize
        semi (:class:`SemI`): the semantic interface for the grammar
            that produced the MRS
        properties: if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        an Indexed MRS string representation of a corpus of MRS objects
    """
    serialized = _encode(ms, semi, properties=properties, lnk=lnk, indent=indent)
    return serialized
def decode(s, semi):
    """
    Deserialize a MRS object from an Indexed MRS string.

    Args:
        s (str): an Indexed MRS string
        semi (:class:`SemI`): the semantic interface for the grammar
            that produced the MRS
    Returns:
        the result of decoding a single Indexed MRS from *s*
    """
    # The lexer works on an iterable of lines rather than one string.
    lines = s.splitlines()
    return _decode_indexed(_IndexedMRSLexer.lex(lines), semi)
def encode(d, semi, properties=True, lnk=True, indent=False):
    """
    Serialize a MRS object to an Indexed MRS string.

    Args:
        d: a MRS object
        semi (:class:`SemI`): the semantic interface for the grammar
            that produced the MRS
        properties (bool): if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        an Indexed MRS-serialization of the MRS object
    """
    serialized = _encode_indexed(
        d, semi, properties=properties, lnk=lnk, indent=indent
    )
    return serialized
##############################################################################
##############################################################################
# Decoding

# Token patterns are tried in the order listed, so the more specific LNK
# pattern (e.g. ``<0:3>``) must precede the bare LANGLE/RANGLE patterns.
# The final catch-all matches any other non-space character as UNEXPECTED.
_IndexedMRSLexer = Lexer(
    tokens=[
        (r"<-?\d+:-?\d+>", "LNK:a lnk value"),
        (r'"([^"\\]*(?:\\.[^"\\]*)*)"', "DQSTRING:a string"),
        (r"<", "LANGLE:<"),
        (r">", "RANGLE:>"),
        (r"\{", "LBRACE:{"),
        (r"\}", "RBRACE:}"),
        (r"\(", "LPAREN:("),
        (r"\)", "RPAREN:)"),
        (r",", "COMMA:,"),
        (r":", "COLON::"),
        (r'[^\s"\'()\/,:;<=>[\]{}]+', "SYMBOL:a symbol"),
        (r"[^\s]", "UNEXPECTED"),
    ],
    error_class=MRSSyntaxError,
)

# Module-level aliases for the lexer's token types.
LNK = _IndexedMRSLexer.tokentypes.LNK
DQSTRING = _IndexedMRSLexer.tokentypes.DQSTRING
LANGLE = _IndexedMRSLexer.tokentypes.LANGLE
RANGLE = _IndexedMRSLexer.tokentypes.RANGLE
LBRACE = _IndexedMRSLexer.tokentypes.LBRACE
RBRACE = _IndexedMRSLexer.tokentypes.RBRACE
LPAREN = _IndexedMRSLexer.tokentypes.LPAREN
RPAREN = _IndexedMRSLexer.tokentypes.RPAREN
COMMA = _IndexedMRSLexer.tokentypes.COMMA
COLON = _IndexedMRSLexer.tokentypes.COLON
SYMBOL = _IndexedMRSLexer.tokentypes.SYMBOL


def _decode(lineiter, semi):
    """Lazily yield one MRS per Indexed MRS found in the lines of *lineiter*."""
    lexer = _IndexedMRSLexer.lex(lineiter)
    try:
        while lexer.peek():
            yield _decode_indexed(lexer, semi)
    except StopIteration:
        # NOTE(review): presumably the lexer signals exhausted input with
        # StopIteration -- confirm. It is swallowed here so the generator
        # ends normally; a StopIteration escaping a generator body would be
        # turned into a RuntimeError by Python (PEP 479).
        pass


def _decode_indexed(lexer, semi):
    """
    Decode a single ``< top, index, {rels}, {hcons} [, {icons}] >`` structure.

    Variable properties are collected positionally while parsing and then
    mapped to property names via *semi* before the MRS is built.
    """
    # No MRS-level lnk, surface, or identifier is parsed here; icons stays
    # None unless a third constraint list is present.
    icons = lnk = surface = identifier = None
    variables = {}
    lexer.expect_type(LANGLE)
    top, _, index = lexer.expect_type(SYMBOL, COMMA, SYMBOL)
    if lexer.accept_type(COLON):
        variables[index] = _decode_proplist(lexer)
    lexer.expect_type(COMMA)
    # _decode_rels() also consumes the comma that follows the rels list.
    rels = _decode_rels(lexer, variables, semi)
    hcons = _decode_cons(lexer, HCons)
    if lexer.accept_type(COMMA):
        icons = _decode_cons(lexer, ICons)
    lexer.expect_type(RANGLE)
    _match_properties(variables, semi)
    return MRS(
        top=top,
        index=index,
        rels=rels,
        hcons=hcons,
        icons=icons,
        variables=variables,
        lnk=lnk,
        surface=surface,
        identifier=identifier,
    )


def _decode_proplist(lexer):
    """Return the list of colon-separated property values at the lexer head."""
    proplist = [lexer.expect_type(SYMBOL)]
    while lexer.accept_type(COLON):
        propval = lexer.expect_type(SYMBOL)
        proplist.append(propval)
    return proplist


def _decode_rels(lexer, variables, semi):
    """
    Decode the brace-delimited, comma-separated list of relations.

    The closing brace *and* the comma after it are consumed.
    """
    rels = []
    lexer.expect_type(LBRACE)
    if lexer.peek()[0] != RBRACE:
        while True:
            rels.append(_decode_rel(lexer, variables, semi))
            if not lexer.accept_type(COMMA):
                break
    lexer.expect_type(RBRACE, COMMA)
    return rels


def _decode_rel(lexer, variables, semi):
    """
    Decode one relation of the form ``label:pred<lnk>(arg, ...)``.

    Arguments carry no role names in the input; the roles are taken, in
    order, from the synopsis that *semi* finds for the predicate and the
    argument variable types.
    """
    label, _, pred = lexer.expect_type(SYMBOL, COLON, SYMBOL)
    lnk = _decode_lnk(lexer)
    arglist, carg = _decode_arglist(lexer, variables)
    argtypes = [variable.type(arg) for arg in arglist]
    synopsis = semi.find_synopsis(pred, argtypes)
    args = {d[0]: v for d, v in zip(synopsis, arglist, strict=True)}
    if carg:
        args[CONSTANT_ROLE] = carg
    return EP(pred, label, args=args, lnk=lnk, surface=None, base=None)


def _decode_lnk(lexer):
    """Return a Lnk built from an LNK token if one is next, otherwise None."""
    lnk = lexer.accept_type(LNK)
    if lnk is not None:
        lnk = Lnk(lnk)
    return lnk


def _decode_arglist(lexer, variables):
    """
    Decode a parenthesized argument list.

    Symbols are variable arguments (their optional ``:``-separated property
    values are recorded in *variables*); a double-quoted string is taken as
    the constant argument.

    Returns:
        a pair of (list of variable arguments, constant argument or None)
    """
    arglist = []
    carg = None
    lexer.expect_type(LPAREN)
    if lexer.peek()[0] != RPAREN:
        while True:
            gid, arg = lexer.choice_type(SYMBOL, DQSTRING)
            if gid == SYMBOL:
                if lexer.accept_type(COLON):
                    variables[arg] = _decode_proplist(lexer)
                arglist.append(arg)
            else:
                carg = arg
            if not lexer.accept_type(COMMA):
                break
    lexer.expect_type(RPAREN)
    return arglist, carg


def _decode_cons(lexer, cls):
    """
    Decode a brace-delimited list of ``lhs relation rhs`` constraints.

    Each constraint is built by calling *cls* with the three symbols.
    """
    cons = []
    lexer.expect_type(LBRACE)
    if lexer.peek()[0] != RBRACE:
        while True:
            lhs, reln, rhs = lexer.expect_type(SYMBOL, SYMBOL, SYMBOL)
            cons.append(cls(lhs, reln, rhs))
            if not lexer.accept_type(COMMA):
                break
    lexer.expect_type(RBRACE)
    return cons


def _match_properties(variables, semi):
    """
    Replace positional property-value lists with name-to-value mappings.

    For every variable in *variables* that has property values, the values
    are paired, in order, with the properties that *semi* declares for the
    variable's type, and the entry is replaced in place by a dict.

    Raises:
        MRSSyntaxError: if the number of values differs from the number of
            declared properties, or if a value is not subsumed by the
            declared value of its property
    """
    for var, propvals in variables.items():
        if not propvals:
            continue
        semiprops = semi.variables[variable.type(var)]
        # Explicit checks rather than ``assert``: assertions are removed
        # when Python runs with -O, which would let bad input through.
        if len(semiprops) != len(propvals):
            raise MRSSyntaxError(
                f"variable {var} has {len(propvals)} property values but "
                f"{len(semiprops)} are expected for its type"
            )
        for sp, pv in zip(semiprops, propvals, strict=True):
            if not semi.properties.subsumes(sp[1], pv):
                raise MRSSyntaxError(
                    f"invalid value {pv} for property {sp[0]} "
                    f"of variable {var}"
                )
        # Only values of existing keys are replaced, so mutating the dict
        # during iteration is safe.
        variables[var] = {sp[0]: pv for sp, pv in zip(semiprops, propvals, strict=True)}


##############################################################################
##############################################################################
# Encoding
def _encode(ms, semi, properties, lnk, indent): if indent is None or indent is False: delim = " " else: delim = "\n" return delim.join(_encode_indexed(m, semi, properties, lnk, indent) for m in ms) def _encode_indexed(m, semi, properties, lnk, indent): if indent is None or indent is False: i1 = ",{{{}}}" i2 = i3 = "," start = "<" end = ">" hook = "{},{}" else: if indent is True: indent = 2 i1 = ",\n" + (" " * indent) + "{{" + (" " * (indent - 1)) + "{} }}" i2 = ",\n" + (" " * indent) i3 = ", " start = "< " end = " >" hook = "{}, {}" if properties: varprops = _prepare_variable_properties(m, semi) else: varprops = {} body = [ hook.format(m.top, _encode_variable(m.index, varprops)), i1.format(i2.join(_encode_rel(ep, semi, varprops, lnk, i3) for ep in m.rels)), i1.format(i2.join(_encode_hcons(hc) for hc in m.hcons)), ] if m.icons: body.append(i1.format(i2.join(_encode_icons(ic) for ic in m.icons))) return start + "".join(body) + end def _prepare_variable_properties(m, semi): proplists = {} for var, varprops in m.variables.items(): if varprops: proplists[var] = [ varprops.get(key, val).upper() for key, val in semi.variables[variable.type(var)] ] return proplists def _encode_variable(var, varprops): if var in varprops: props = ":" + ":".join(varprops[var]) del varprops[var] else: props = "" return var + props def _encode_rel(ep, semi, varprops, lnk, delim): roles = {role: None for role in ep.args if role != CONSTANT_ROLE} synopsis = semi.find_synopsis(ep.predicate, roles) args = [ _encode_variable(ep.args[d.name], varprops) for d in synopsis if d.name in ep.args ] if ep.carg is not None: args.append(f'"{ep.carg}"') return "{label}:{pred}{lnk}({args})".format( label=ep.label, pred=ep.predicate, lnk=str(ep.lnk) if lnk else "", args=delim.join(args), ) def _encode_hcons(hc): return f"{hc.hi} {hc.relation} {hc.lo}" def _encode_icons(ic): return f"{ic.left} {ic.relation} {ic.right}"