Source code for delphin.codecs.indexedmrs


"""
Serialization for the Indexed MRS format.
"""

from pathlib import Path

from delphin import variable
from delphin.lnk import Lnk
from delphin.mrs import CONSTANT_ROLE, EP, MRS, HCons, ICons, MRSSyntaxError
from delphin.util import Lexer

CODEC_INFO = {
    'representation': 'mrs',
}


##############################################################################
##############################################################################
# Pickle-API methods

[docs] def load(source, semi): """ Deserialize Indexed MRS from a file (handle or filename) Args: source (str, file): input filename or file object semi (:class:`SemI`): the semantic interface for the grammar that produced the MRS Returns: a list of MRS objects """ if hasattr(source, 'read'): ms = list(_decode(source, semi)) else: source = Path(source).expanduser() with source.open() as fh: ms = list(_decode(fh, semi)) return ms
[docs] def loads(s, semi, single=False, encoding='utf-8'): """ Deserialize Indexed MRS string representations Args: s (str): an Indexed MRS string semi (:class:`SemI`): the semantic interface for the grammar that produced the MRS Returns: a list of MRS objects """ ms = list(_decode(s.splitlines(), semi)) return ms
[docs] def dump(ms, destination, semi, properties=True, lnk=True, indent=False, encoding='utf-8'): """ Serialize MRS objects to Indexed MRS and write to a file Args: ms: an iterator of MRS objects to serialize destination: filename or file object where data will be written semi (:class:`SemI`): the semantic interface for the grammar that produced the MRS properties: if `False`, suppress morphosemantic properties lnk: if `False`, suppress surface alignments and strings indent (bool, int): if `True` or an integer value, add newlines and indentation encoding (str): if *destination* is a filename, write to the file with the given encoding; otherwise it is ignored """ text = dumps(ms, semi, properties=properties, lnk=lnk, indent=indent) if hasattr(destination, 'write'): print(text, file=destination) else: destination = Path(destination).expanduser() with destination.open('w', encoding=encoding) as fh: print(text, file=fh)
[docs] def dumps(ms, semi, properties=True, lnk=True, indent=False): """ Serialize MRS objects to an Indexed MRS representation Args: ms: an iterator of MRS objects to serialize semi (:class:`SemI`): the semantic interface for the grammar that produced the MRS properties: if `False`, suppress variable properties lnk: if `False`, suppress surface alignments and strings indent (bool, int): if `True` or an integer value, add newlines and indentation Returns: an Indexed MRS string representation of a corpus of MRS objects """ return _encode(ms, semi, properties, lnk, indent)
[docs] def decode(s, semi): """ Deserialize a MRS object from an Indexed MRS string. Args: s (str): an Indexed MRS string semi (:class:`SemI`): the semantic interface for the grammar that produced the MRS """ lexer = _IndexedMRSLexer.lex(s.splitlines()) return _decode_indexed(lexer, semi)
[docs] def encode(d, semi, properties=True, lnk=True, indent=False): """ Serialize a MRS object to an Indexed MRS string. Args: d: a MRS object semi (:class:`SemI`): the semantic interface for the grammar that produced the MRS properties (bool): if `False`, suppress variable properties lnk: if `False`, suppress surface alignments and strings indent (bool, int): if `True` or an integer value, add newlines and indentation Returns: an Indexed MRS-serialization of the MRS object """ return _encode_indexed(d, semi, properties, lnk, indent)
############################################################################## ############################################################################## # Decoding _IndexedMRSLexer = Lexer( tokens=[ (r'<-?\d+:-?\d+>', 'LNK:a lnk value'), (r'"([^"\\]*(?:\\.[^"\\]*)*)"', 'DQSTRING:a string'), (r'<', 'LANGLE:<'), (r'>', 'RANGLE:>'), (r'\{', 'LBRACE:{'), (r'\}', 'RBRACE:}'), (r'\(', 'LPAREN:('), (r'\)', 'RPAREN:)'), (r',', 'COMMA:,'), (r':', 'COLON::'), (r'[^\s"\'()\/,:;<=>[\]{}]+', 'SYMBOL:a symbol'), (r'[^\s]', 'UNEXPECTED') ], error_class=MRSSyntaxError) LNK = _IndexedMRSLexer.tokentypes.LNK DQSTRING = _IndexedMRSLexer.tokentypes.DQSTRING LANGLE = _IndexedMRSLexer.tokentypes.LANGLE RANGLE = _IndexedMRSLexer.tokentypes.RANGLE LBRACE = _IndexedMRSLexer.tokentypes.LBRACE RBRACE = _IndexedMRSLexer.tokentypes.RBRACE LPAREN = _IndexedMRSLexer.tokentypes.LPAREN RPAREN = _IndexedMRSLexer.tokentypes.RPAREN COMMA = _IndexedMRSLexer.tokentypes.COMMA COLON = _IndexedMRSLexer.tokentypes.COLON SYMBOL = _IndexedMRSLexer.tokentypes.SYMBOL def _decode(lineiter, semi): lexer = _IndexedMRSLexer.lex(lineiter) try: while lexer.peek(): yield _decode_indexed(lexer, semi) except StopIteration: pass def _decode_indexed(lexer, semi): icons = lnk = surface = identifier = None variables = {} lexer.expect_type(LANGLE) top, _, index = lexer.expect_type(SYMBOL, COMMA, SYMBOL) if lexer.accept_type(COLON): variables[index] = _decode_proplist(lexer) lexer.expect_type(COMMA) rels = _decode_rels(lexer, variables, semi) hcons = _decode_cons(lexer, HCons) if lexer.accept_type(COMMA): icons = _decode_cons(lexer, ICons) lexer.expect_type(RANGLE) _match_properties(variables, semi) return MRS(top=top, index=index, rels=rels, hcons=hcons, icons=icons, variables=variables, lnk=lnk, surface=surface, identifier=identifier) def _decode_proplist(lexer): proplist = [lexer.expect_type(SYMBOL)] while lexer.accept_type(COLON): propval = lexer.expect_type(SYMBOL) proplist.append(propval) return proplist def _decode_rels(lexer, variables, semi): rels = [] lexer.expect_type(LBRACE) if lexer.peek()[0] != RBRACE: while True: rels.append(_decode_rel(lexer, variables, semi)) if not lexer.accept_type(COMMA): break lexer.expect_type(RBRACE, COMMA) return rels def _decode_rel(lexer, variables, semi): label, _, pred = lexer.expect_type(SYMBOL, COLON, SYMBOL) lnk = _decode_lnk(lexer) arglist, carg = _decode_arglist(lexer, variables) argtypes = [variable.type(arg) for arg in arglist] synopsis = semi.find_synopsis(pred, argtypes) args = {d[0]: v for d, v in zip(synopsis, arglist)} if carg: args[CONSTANT_ROLE] = carg return EP( pred, label, args=args, lnk=lnk, surface=None, base=None) def _decode_lnk(lexer): lnk = lexer.accept_type(LNK) if lnk is not None: lnk = Lnk(lnk) return lnk def _decode_arglist(lexer, variables): arglist = [] carg = None lexer.expect_type(LPAREN) if lexer.peek()[0] != RPAREN: while True: gid, arg = lexer.choice_type(SYMBOL, DQSTRING) if gid == SYMBOL: if lexer.accept_type(COLON): variables[arg] = _decode_proplist(lexer) arglist.append(arg) else: carg = arg if not lexer.accept_type(COMMA): break lexer.expect_type(RPAREN) return arglist, carg def _decode_cons(lexer, cls): cons = [] lexer.expect_type(LBRACE) if lexer.peek()[0] != RBRACE: while True: lhs, reln, rhs = lexer.expect_type(SYMBOL, SYMBOL, SYMBOL) cons.append(cls(lhs, reln, rhs)) if not lexer.accept_type(COMMA): break lexer.expect_type(RBRACE) return cons def _match_properties(variables, semi): for var, propvals in variables.items(): if not propvals: continue semiprops = semi.variables[variable.type(var)] assert len(semiprops) == len(propvals) assert all(semi.properties.subsumes(sp[1], pv) for sp, pv in zip(semiprops, propvals)) variables[var] = {sp[0]: pv for sp, pv in zip(semiprops, propvals)} ############################################################################## ############################################################################## # Encoding def _encode(ms, semi, properties, lnk, indent): if indent is None or indent is False: delim = ' ' else: delim = '\n' return delim.join( _encode_indexed(m, semi, properties, lnk, indent) for m in ms) def _encode_indexed(m, semi, properties, lnk, indent): if indent is None or indent is False: i1 = ',{{{}}}' i2 = i3 = ',' start = '<' end = '>' hook = '{},{}' else: if indent is True: indent = 2 i1 = ',\n' + (' ' * indent) + '{{' + (' ' * (indent - 1)) + '{} }}' i2 = ',\n' + (' ' * indent) i3 = ', ' start = '< ' end = ' >' hook = '{}, {}' if properties: varprops = _prepare_variable_properties(m, semi) else: varprops = {} body = [ hook.format(m.top, _encode_variable(m.index, varprops)), i1.format(i2.join(_encode_rel(ep, semi, varprops, lnk, i3) for ep in m.rels)), i1.format(i2.join(_encode_hcons(hc) for hc in m.hcons)) ] if m.icons: body.append( i1.format(i2.join(_encode_icons(ic) for ic in m.icons))) return start + ''.join(body) + end def _prepare_variable_properties(m, semi): proplists = {} for var, varprops in m.variables.items(): if varprops: proplists[var] = [ varprops.get(key, val).upper() for key, val in semi.variables[variable.type(var)]] return proplists def _encode_variable(var, varprops): if var in varprops: props = ':' + ':'.join(varprops[var]) del varprops[var] else: props = '' return var + props def _encode_rel(ep, semi, varprops, lnk, delim): roles = {role: None for role in ep.args if role != CONSTANT_ROLE} synopsis = semi.find_synopsis(ep.predicate, roles) args = [_encode_variable(ep.args[d.name], varprops) for d in synopsis if d.name in ep.args] if ep.carg is not None: args.append('"{}"'.format(ep.carg)) return '{label}:{pred}{lnk}({args})'.format( label=ep.label, pred=ep.predicate, lnk=str(ep.lnk) if lnk else '', args=delim.join(args)) def _encode_hcons(hc): return '{} {} {}'.format(hc.hi, hc.relation, hc.lo) def _encode_icons(ic): return '{} {} {}'.format(ic.left, ic.relation, ic.right)