Source code for delphin.codecs.simplemrs

"""
Serialization functions for the SimpleMRS format.
"""

import re
from pathlib import Path
from typing import Optional

from delphin import predicate, variable
from delphin.lnk import Lnk
from delphin.mrs import CONSTANT_ROLE, EP, MRS, HCons, ICons, MRSSyntaxError
from delphin.sembase import property_priority, role_priority
from delphin.util import Lexer

CODEC_INFO = {
    'representation': 'mrs',
}


TOP_FEATURE = 'TOP'


##############################################################################
##############################################################################
# Pickle-API methods


def load(source):
    """
    Deserialize SimpleMRSs from a file (handle or filename)

    Args:
        source (str, file): input filename or file object
    Returns:
        a list of MRS objects
    """
    if hasattr(source, 'read'):
        ms = list(_decode(source))
    else:
        source = Path(source).expanduser()
        with source.open() as fh:
            ms = list(_decode(fh))
    return ms
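
# Illustrative use of load() -- a sketch only; the filename 'sentences.mrs'
# is hypothetical:
#
#     >>> ms = load('sentences.mrs')          # filename
#     >>> ms = load(open('sentences.mrs'))    # or an open file handle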

def loads(s):
    """
    Deserialize SimpleMRS string representations

    Args:
        s (str): a SimpleMRS string
    Returns:
        a list of MRS objects
    """
    ms = list(_decode(s.splitlines()))
    return ms
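
# Illustrative use of loads() -- a sketch; the SimpleMRS string below is a
# minimal hypothetical example of the format this codec parses:
#
#     >>> ms = loads('[ TOP: h0 INDEX: e2'
#     ...            '  RELS: < [ _rain_v_1<3:9> LBL: h1 ARG0: e2 ] >'
#     ...            '  HCONS: < h0 qeq h1 > ]')
#     >>> len(ms)
#     1
#     >>> ms[0].top
#     'h0'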

def dump(ms, destination, properties=True, lnk=True,
         indent=False, encoding='utf-8'):
    """
    Serialize MRS objects to SimpleMRS and write to a file

    Args:
        ms: an iterator of MRS objects to serialize
        destination: filename or file object where data will be written
        properties: if `False`, suppress morphosemantic properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
        encoding (str): if *destination* is a filename, write to the
            file with the given encoding; otherwise it is ignored
    """
    text = dumps(ms, properties=properties, lnk=lnk, indent=indent)
    if hasattr(destination, 'write'):
        print(text, file=destination)
    else:
        destination = Path(destination).expanduser()
        with destination.open('w', encoding=encoding) as fh:
            print(text, file=fh)

def dumps(ms, properties=True, lnk=True, indent=False):
    """
    Serialize MRS objects to a SimpleMRS representation

    Args:
        ms: an iterator of MRS objects to serialize
        properties: if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        a SimpleMRS string representation of a corpus of MRS objects
    """
    return _encode(ms, properties, lnk, indent)
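
# Sketch of dumps() on a (hypothetical) list of MRS objects.  Per _encode()
# below, with indent=False the serialized MRSs are space-separated on one
# line, while any truthy indent value puts them on separate lines:
#
#     >>> print(dumps(ms))               # whole corpus on a single line
#     >>> print(dumps(ms, indent=True))  # one multi-line block per MRS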

def decode(s):
    """
    Deserialize an MRS object from a SimpleMRS string.
    """
    lexer = SimpleMRSLexer.lex(s.splitlines())
    return _decode_mrs(lexer)

def encode(m, properties=True, lnk=True, indent=False):
    """
    Serialize an MRS object to a SimpleMRS string.

    Args:
        m: an MRS object
        properties (bool): if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        a SimpleMRS-serialization of the MRS object
    """
    return _encode([m], properties, lnk, indent)
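
# Sketch of the single-MRS pair decode()/encode(), reusing the hypothetical
# string from the loads() example above.  Passing lnk=False should drop the
# <3:9> surface alignment from the output:
#
#     >>> m = decode('[ TOP: h0 INDEX: e2'
#     ...            '  RELS: < [ _rain_v_1<3:9> LBL: h1 ARG0: e2 ] >'
#     ...            '  HCONS: < h0 qeq h1 > ]')
#     >>> encode(m, lnk=False)
#     '[ TOP: h0 INDEX: e2 RELS: < [ _rain_v_1 LBL: h1 ARG0: e2 ] > HCONS: < h0 qeq h1 > ]'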

##############################################################################
##############################################################################
# Deserialization


SimpleMRSLexer = Lexer(
    tokens=[
        (r'\[', 'LBRACK:['),
        (r'\]', 'RBRACK:]'),
        (r'<(?:-?\d+[:#]-?\d+'
         r'|@\d+'
         r'|\d+(?: +\d+)*)>', 'LNK:a lnk value'),
        (r'"([^"\\]*(?:\\.[^"\\]*)*)"', 'DQSTRING:a string'),
        (r"'([^ \n:<>\[\]]+)", 'SQSYMBOL:a quoted symbol'),
        (r'_[^\s_]+'                                # lemma
         r'_[nvajrscpqxud]'                         # pos
         r'(?:_(?:[^\s_<]|<(?![-0-9:#@ ]*>\s))+)?'  # optional sense
         r'(?:_rel)?',                              # optional suffix
         'PREDICATE:a surface predicate'),
        (r'<', 'LANGLE:<'),
        (r'>', 'RANGLE:>'),
        (r'([^\s:<>\[\]]+):', 'FEATURE:a feature'),
        (r'(?:[^ \n\]<]+'
         r'|<(?![-0-9:#@ ]*>\s))+', 'SYMBOL:a symbol'),
        (r'[^\s]', 'UNEXPECTED'),
    ],
    error_class=MRSSyntaxError)

LBRACK = SimpleMRSLexer.tokentypes.LBRACK
RBRACK = SimpleMRSLexer.tokentypes.RBRACK
LNK = SimpleMRSLexer.tokentypes.LNK
DQSTRING = SimpleMRSLexer.tokentypes.DQSTRING
SQSYMBOL = SimpleMRSLexer.tokentypes.SQSYMBOL
PREDICATE = SimpleMRSLexer.tokentypes.PREDICATE
LANGLE = SimpleMRSLexer.tokentypes.LANGLE
RANGLE = SimpleMRSLexer.tokentypes.RANGLE
FEATURE = SimpleMRSLexer.tokentypes.FEATURE
SYMBOL = SimpleMRSLexer.tokentypes.SYMBOL


def _decode(lineiter):
    lexer = SimpleMRSLexer.lex(lineiter)
    try:
        while lexer.peek():
            yield _decode_mrs(lexer)
    except StopIteration:
        pass


def _decode_mrs(lexer):
    top = index = lnk = surface = identifier = None
    rels = []
    hcons = []
    icons = []
    variables = {}
    lexer.expect_type(LBRACK)
    lnk = _decode_lnk(lexer)
    surface = _decode_dqstring(lexer.accept_type(DQSTRING))
    feature = lexer.accept_type(FEATURE)
    while feature is not None:
        feature = feature.upper()
        if feature in ('LTOP', 'TOP'):
            top = lexer.expect_type(SYMBOL).lower()
        elif feature == 'INDEX':
            index = _decode_variable(lexer, variables)
        elif feature == 'RELS':
            lexer.expect_type(LANGLE)
            while lexer.peek()[0] == LBRACK:
                rels.append(_decode_rel(lexer, variables))
            lexer.expect_type(RANGLE)
        elif feature == 'HCONS':
            lexer.expect_type(LANGLE)
            while lexer.peek()[0] == SYMBOL:
                hcons.append(_decode_cons(lexer, HCons, variables))
            lexer.expect_type(RANGLE)
        elif feature == 'ICONS':
            lexer.expect_type(LANGLE)
            while lexer.peek()[0] == SYMBOL:
                icons.append(_decode_cons(lexer, ICons, variables))
            lexer.expect_type(RANGLE)
        else:
            raise ValueError('invalid feature: ' + feature)
        feature = lexer.accept_type(FEATURE)
    lexer.expect_type(RBRACK)
    return MRS(top, index, rels, hcons,
               icons=icons, variables=variables,
               lnk=lnk, surface=surface, identifier=identifier)


def _decode_lnk(lexer):
    lnk = lexer.accept_type(LNK)
    if lnk is not None:
        lnk = Lnk(lnk)
    return lnk


def _decode_dqstring(dqstring: Optional[str]) -> Optional[str]:
    if dqstring is not None:
        dqstring = _unescape(dqstring)
    return dqstring


def _decode_variable(lexer, variables):
    var = lexer.expect_type(SYMBOL).lower()
    if var not in variables:
        variables[var] = {}
    props = variables[var]
    if lexer.accept_type(LBRACK):
        lexer.accept_type(SYMBOL)  # variable type
        feature = lexer.accept_type(FEATURE)
        while feature is not None:
            value = lexer.expect_type(SYMBOL)
            props[feature.upper()] = value.lower()
            feature = lexer.accept_type(FEATURE)
        lexer.expect_type(RBRACK)
    return var


def _decode_rel(lexer, variables):
    args = {}
    surface = None
    lexer.expect_type(LBRACK)
    pred = _decode_predicate(lexer)
    lnk = _decode_lnk(lexer)
    surface = _decode_dqstring(lexer.accept_type(DQSTRING))
    _, label = lexer.expect((FEATURE, 'LBL'), (SYMBOL, None))
    # any remaining are arguments or a constant
    role = lexer.accept_type(FEATURE)
    while role is not None:
        role = role.upper()
        if role == 'CARG':
            value = _decode_dqstring(lexer.expect_type(DQSTRING))
        else:
            value = _decode_variable(lexer, variables)
        args[role] = value
        role = lexer.accept_type(FEATURE)
    lexer.expect_type(RBRACK)
    return EP(pred,
              label.lower(),
              args=args,
              lnk=lnk,
              surface=surface,
              base=None)


def _decode_predicate(lexer) -> str:
    predstring = lexer.accept_type(DQSTRING)
    if predstring is not None:
        predstring = _decode_dqstring(predstring)
    else:
        predstring = lexer.choice_type(SQSYMBOL, PREDICATE, SYMBOL)[1]
    return predicate.normalize(predstring)


def _decode_cons(lexer, cls, variables):
    lhs = _decode_variable(lexer, variables)
    relation = lexer.expect_type(SYMBOL).lower()
    rhs = _decode_variable(lexer, variables)
    return cls(lhs, relation, rhs)


##############################################################################
##############################################################################
# Encoding


def _encode(ms, properties, lnk, indent):
    if indent is None or indent is False:
        indent = False  # normalize None to False
        delim = ' '
    else:
        indent = True  # normalize integers to True
        delim = '\n'
    return delim.join(_encode_mrs(m, properties, lnk, indent) for m in ms)


def _encode_mrs(m, properties, lnk, indent):
    delim = '\n ' if indent else ' '
    if properties:
        varprops = dict(m.variables)
    else:
        varprops = {}
    parts = [
        _encode_surface_info(m, lnk),
        _encode_hook(m, varprops, indent),
        _encode_rels(m.rels, varprops, lnk, indent),
        _encode_hcons(m.hcons),
        _encode_icons(m.icons, varprops)
    ]
    return '[ {} ]'.format(
        delim.join(
            ' '.join(tokens)
            for tokens in parts
            if tokens))


def _encode_surface_info(m, lnk):
    tokens = []
    if lnk:
        if m.lnk:
            tokens.append(str(m.lnk))
        if m.surface is not None:
            tokens.append('"{}"'.format(_escape(m.surface)))
    return tokens


def _encode_hook(m, varprops, indent):
    delim = '\n ' if indent else ' '
    tokens = []
    if m.top is not None:
        tokens.append('{}: {}'.format(TOP_FEATURE, m.top))
    if m.index is not None:
        tokens.append('INDEX: {}'.format(_encode_variable(m.index, varprops)))
    if tokens:
        tokens = [delim.join(tokens)]
    return tokens


def _encode_variable(var, varprops):
    tokens = [var]
    if varprops.get(var):
        tokens.append('[')
        tokens.append(variable.type(var))
        for prop in sorted(varprops[var], key=property_priority):
            val = varprops[var][prop]
            tokens.append(prop + ':')
            tokens.append(val)
        tokens.append(']')
        del varprops[var]
    return ' '.join(tokens)


def _encode_rels(rels, varprops, lnk, indent):
    delim = ('\n ' + ' ' * len('RELS: < ')) if indent else ' '
    tokens = []
    for rel in rels:
        pred = _encode_predicate(rel.predicate)
        if lnk:
            pred += str(rel.lnk)
        reltoks = ['[', pred]
        if lnk and rel.surface is not None:
            reltoks.append('"{}"'.format(_escape(rel.surface)))
        reltoks.extend(('LBL:', rel.label))
        for role in sorted(rel.args, key=role_priority):
            arg = rel.args[role]
            if role == CONSTANT_ROLE:
                arg = '"{}"'.format(_escape(arg))
            else:
                arg = _encode_variable(arg, varprops)
            reltoks.extend((role + ':', arg))
        reltoks.append(']')
        tokens.append(' '.join(reltoks))
    if tokens:
        tokens = ['RELS: <'] + [delim.join(tokens)] + ['>']
    return tokens


def _encode_predicate(predicate: str) -> str:
    if re.search(r"[\s\"':<>[\]]", predicate):
        return f'"{_escape(predicate)}"'
    return predicate


def _encode_hcons(hcons):
    tokens = ['{} {} {}'.format(hc.hi, hc.relation, hc.lo)
              for hc in hcons]
    if tokens:
        tokens = ['HCONS: <'] + [' '.join(tokens)] + ['>']
    return tokens


def _encode_icons(icons, varprops):
    tokens = ['{} {} {}'.format(_encode_variable(ic.left, varprops),
                                ic.relation,
                                _encode_variable(ic.right, varprops))
              for ic in icons]
    if tokens:
        tokens = ['ICONS: <'] + [' '.join(tokens)] + ['>']
    return tokens


# Character Escaping

_ESCAPES = {
    '\\': '\\\\',
    '"': '\\"',
}
_UNESCAPES = {
    '\\\\': '\\',
    '\\"': '"',
}


def _escape(s: str) -> str:
    return "".join(_ESCAPES.get(c, c) for c in s)


def _unescape(s: str) -> str:
    if not s:
        return s
    cs = []
    i = 0
    while i < len(s):
        if s[i] == '\\' and (i + 1) < len(s):
            cs.append(s[i+1])
            i += 2
        else:
            cs.append(s[i])
            i += 1
    return "".join(cs)
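
# Quick sanity-check sketch of the escaping helpers above: double quotes and
# backslashes are backslash-escaped on output, and _unescape() simply drops
# the backslash of any escape sequence on input:
#
#     >>> _escape('say "hi"')
#     'say \\"hi\\"'
#     >>> _unescape('say \\"hi\\"')
#     'say "hi"'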