Source code for delphin.codecs.mrx

"""
MRX (XML for MRS) serialization and deserialization.
"""

import io
import re
import xml.etree.ElementTree as etree
from pathlib import Path

from delphin import predicate, variable
from delphin.lnk import Lnk
from delphin.mrs import CONSTANT_ROLE, EP, MRS, HCons, ICons
from delphin.sembase import property_priority, role_priority

# Codec metadata; 'representation' names the kind of semantic structure
# this module reads and writes.
CODEC_INFO = dict(representation='mrs')

# Pieces of an <mrs-list> document: the opening tag of the root element,
# the (empty) separator between items, and the closing tag of the root.
# NOTE(review): none of these are referenced by the functions in this
# module; presumably they are for callers that assemble output
# piecewise -- confirm against the codec API.
HEADER, JOINER, FOOTER = '<mrs-list>', '', '</mrs-list>'


##############################################################################
##############################################################################
# Pickle-API methods

def load(source, encoding='utf-8'):
    """
    Deserialize MRX from a file (handle or filename)

    Args:
        source (str, file): input filename or file object
        encoding (str): if *source* is a filename, read the file with
            the given encoding; otherwise it is ignored
    Returns:
        a list of MRS objects
    """
    if hasattr(source, 'read'):
        # Already an open file-like object: parse it as given.
        return list(_decode(source))
    # Name the encoding explicitly (matching dump()'s default) instead of
    # relying on the platform-dependent default of open().
    path = Path(source).expanduser()
    with path.open(encoding=encoding) as fh:
        # Materialize the list before the file is closed; _decode is lazy.
        return list(_decode(fh))
def loads(s):
    """
    Deserialize MRX string representations

    The string is wrapped in an in-memory text buffer and parsed the
    same way as a file.

    Args:
        s (str): an MRX string
    Returns:
        a list of MRS objects
    """
    buffer = io.StringIO(s)
    return list(_decode(buffer))
def dump(ms, destination, properties=True, lnk=True, indent=False,
         encoding='utf-8'):
    """
    Serialize MRS objects to MRX and write to a file

    Args:
        ms: an iterator of MRS objects to serialize
        destination: filename or file object where data will be written
        properties: if `False`, suppress morphosemantic properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
        encoding (str): if *destination* is a filename, write to the
            file with the given encoding; otherwise it is ignored
    """
    serialized = dumps(ms, properties=properties, lnk=lnk, indent=indent)
    if not hasattr(destination, 'write'):
        # A path was given: open it ourselves with the requested encoding.
        target = Path(destination).expanduser()
        with target.open('w', encoding=encoding) as fh:
            print(serialized, file=fh)
        return
    # An open, writable object was given: write to it directly.
    print(serialized, file=destination)
def dumps(ms, properties=True, lnk=True, indent=False):
    """
    Serialize MRS objects to an MRX representation

    Args:
        ms: an iterator of MRS objects to serialize
        properties: if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        an MRX string representation of a corpus of MRS objects
    """
    # Build the <mrs-list> element tree, then render it as text.
    root = _encode(ms, properties, lnk)
    return _tostring(root, indent, 1)
def decode(s):
    """
    Deserialize an MRS object from an MRX string.

    The string is parsed as a single XML element which is then
    converted by the `<mrs>` element decoder.
    """
    return _decode_mrs(etree.fromstring(s))
def encode(m, properties=True, lnk=True, indent=False):
    """
    Serialize a MRS object to an MRX string.

    Args:
        m: an MRS object
        properties (bool): if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        an MRX-serialization of the MRS object
    """
    # A single <mrs> element is rendered on its own (no <mrs-list> wrapper).
    mrs_elem = _encode_mrs(m, properties, lnk)
    return _tostring(mrs_elem, indent, 0)
##############################################################################
##############################################################################
# Decoding

def _decode(fh):
    """Lazily yield one MRS per `<mrs>` element parsed from *fh*."""
    # <!ELEMENT mrs-list (mrs)*>
    # if memory becomes a big problem, consider catching start events,
    # get the root element (later start events can be ignored), and
    # root.clear() after decoding each mrs
    for _, elem in etree.iterparse(fh, events=('end',)):
        if elem.tag == 'mrs':
            yield _decode_mrs(elem)
            # the element's content is no longer needed once decoded
            elem.clear()


def _decode_mrs(elem):
    """Build an MRS object from an `<mrs>` element (or element tree)."""
    # <!ELEMENT mrs (label, var, (ep|hcons)*)>
    # <!ATTLIST mrs
    #           cfrom   CDATA #IMPLIED
    #           cto     CDATA #IMPLIED
    #           surface CDATA #IMPLIED
    #           ident   CDATA #IMPLIED >
    elem = elem.find('.')  # in case elem is ElementTree rather than Element
    # filled in as a side effect of every _decode_var() call below
    variables = {}
    top = elem.find('label')
    if top is not None:
        top = _decode_label(top)
    index = elem.find('var')
    if index is not None:
        index = _decode_var(index, variables=variables)
    rels = [_decode_ep(ep, variables) for ep in elem.iter('ep')]
    hcons = [_decode_hcons(hc, variables) for hc in elem.iter('hcons')]
    icons = [_decode_icons(ic, variables) for ic in elem.iter('icons')]
    return MRS(top, index, rels, hcons,
               icons=icons,
               variables=variables,
               lnk=_decode_lnk(elem.get('cfrom'), elem.get('cto')),
               surface=elem.get('surface'),
               identifier=elem.get('ident'))


def _decode_label(elem):
    """Return the handle string (`'h'` + vid) for a `<label>` element."""
    # <!ELEMENT label (extrapair*)>
    # <!ATTLIST label
    #           vid CDATA #REQUIRED >
    vid = elem.get('vid')
    # ignoring extrapairs
    return 'h' + vid


def _decode_var(elem, variables):
    """
    Return the variable string (sort + vid) for a `<var>` element.

    Any extrapair properties found under *elem* are merged into
    ``variables[var]``, which is created if it does not exist yet.
    """
    # <!ELEMENT var (extrapair*)>
    # <!ATTLIST var
    #           vid  CDATA #REQUIRED
    #           sort (x|e|h|u|l|i) #IMPLIED >
    vid = elem.get('vid')
    # NOTE: although the DTD marks `sort` as optional, a missing
    # attribute raises AttributeError here
    srt = elem.get('sort').lower()
    var = srt + vid
    varprops = variables.setdefault(var, {})
    for prop, val in _decode_extrapairs(elem.iter('extrapair')):
        varprops[prop] = val
    return var


def _decode_extrapairs(elems):
    """Return (PATH, value) pairs with upper-cased paths and lower-cased values."""
    # <!ELEMENT extrapair (path,value)>
    # <!ELEMENT path (#PCDATA)>
    # <!ELEMENT value (#PCDATA)>
    return [(e.find('path').text.upper(), e.find('value').text.lower())
            for e in elems]


def _decode_ep(elem, variables=None):
    """Build an EP from an `<ep>` element."""
    # <!ELEMENT ep ((pred|spred|realpred), label, fvpair*)>
    # <!ATTLIST ep
    #           cfrom   CDATA #IMPLIED
    #           cto     CDATA #IMPLIED
    #           surface CDATA #IMPLIED
    #           base    CDATA #IMPLIED >
    args = _decode_args(elem, variables=variables)
    # './' selects the first child, which the DTD says is the predicate
    return EP(_decode_pred(elem.find('./')),
              _decode_label(elem.find('label')),
              args=args,
              lnk=_decode_lnk(elem.get('cfrom'), elem.get('cto')),
              surface=elem.get('surface'),
              base=elem.get('base'))


def _decode_pred(elem):
    """
    Return the predicate string for a pred, spred, or realpred element.

    Returns `None` if the element has any other tag.
    """
    # <!ELEMENT pred (#PCDATA)>
    # <!ELEMENT spred (#PCDATA)>
    # <!ELEMENT realpred EMPTY>
    # <!ATTLIST realpred
    #           lemma CDATA #REQUIRED
    #           pos (v|n|j|r|p|q|c|x|u|a|s) #REQUIRED
    #           sense CDATA #IMPLIED >
    if elem.tag in ('pred', 'spred'):
        return elem.text
    elif elem.tag == 'realpred':
        return predicate.create(elem.get('lemma'),
                                elem.get('pos'),
                                elem.get('sense'))


def _decode_args(elem, variables=None):
    """Return a dict mapping upper-cased role names to argument values."""
    # <!ELEMENT fvpair (rargname, (var|constant))>
    # This code assumes that only cargs have constant values, and all
    # other args (including IVs) have var values.
    args = {}
    for e in elem.findall('fvpair'):
        rargname = e.find('rargname').text.upper()
        if e.find('constant') is not None:
            argval = e.find('constant').text
        elif e.find('var') is not None:
            argval = _decode_var(e.find('var'), variables=variables)
        # NOTE: an fvpair with neither <constant> nor <var> leaves
        # argval unset (or holding the previous pair's value)
        args[rargname] = argval
    return args


def _decode_hcons(elem, variables):
    """Build an HCons from an `<hcons>` element."""
    # <!ELEMENT hcons (hi, lo)>
    # <!ATTLIST hcons
    #           hreln (qeq|lheq|outscopes) #REQUIRED >
    # <!ELEMENT hi (var)>
    # <!ELEMENT lo (label|var)>
    hi = _decode_var(elem.find('hi/var'), variables)
    lo = elem.find('lo/')  # first child of <lo>: either <var> or <label>
    if lo.tag == 'var':
        lo = _decode_var(lo, variables)
    else:
        lo = _decode_label(lo)
    return HCons(hi, elem.get('hreln'), lo)


# this isn't part of the spec; just putting here in case it's added later
def _decode_icons(elem, variables):
    """Build an ICons from an `<icons>` element."""
    # <!ELEMENT icons (left, right)>
    # <!ATTLIST icons
    #           ireln #REQUIRED >
    # <!ELEMENT left (var)>
    # <!ELEMENT right (var)>
    return ICons(_decode_var(elem.find('left/var'), variables),
                 elem.get('ireln'),
                 _decode_var(elem.find('right/var'), variables))


def _decode_lnk(cfrom, cto):
    """
    Return a character-span Lnk, or `None` if both values are `None`.

    Raises:
        ValueError: if exactly one of *cfrom* and *cto* is `None`
    """
    if cfrom is cto is None:
        return None
    elif None in (cfrom, cto):
        raise ValueError('Both cfrom and cto, or neither, must be specified.')
    else:
        return Lnk.charspan(cfrom, cto)


##############################################################################
##############################################################################
# Encoding

def _encode(ms, properties, lnk):
    """Return an `<mrs-list>` element with one `<mrs>` child per item of *ms*."""
    e = etree.Element('mrs-list')
    for m in ms:
        e.append(_encode_mrs(m, properties, lnk))
    return e


def _encode_mrs(m, properties, lnk):
    """Return an `<mrs>` element for the MRS object *m*."""
    # Work on a copy: _encode_variable() deletes entries once they have
    # been written so each variable's properties appear only once.
    if properties:
        varprops = dict(m.variables)
    else:
        varprops = {}
    attributes = {}
    if lnk:
        attributes['cfrom'] = str(m.cfrom)
        attributes['cto'] = str(m.cto)
        # surface strings are suppressed together with the alignments
        if m.surface is not None:
            attributes['surface'] = m.surface
    if m.identifier is not None:
        attributes['ident'] = m.identifier
    e = etree.Element('mrs', attrib=attributes)
    if m.top is not None:
        e.append(_encode_label(m.top))
    if m.index is not None:
        e.append(_encode_variable(m.index, varprops))
    for ep in m.rels:
        e.append(_encode_ep(ep, varprops, lnk))
    for hc in m.hcons:
        e.append(_encode_hcon(hc, varprops))
    for ic in m.icons:
        e.append(_encode_icon(ic, varprops))
    return e


def _encode_label(label):
    """Return a `<label>` element carrying only the vid of *label*."""
    _, vid = variable.split(label)
    return etree.Element('label', vid=vid)


def _encode_variable(v, varprops):
    """
    Return a `<var>` element for the variable *v*.

    If *varprops* has a non-empty entry for *v*, its properties are
    written as extrapairs and the entry is then removed from *varprops*.
    """
    srt, vid = variable.split(v)
    var = etree.Element('var', vid=vid, sort=srt)
    if varprops.get(v):
        for key in sorted(varprops[v], key=property_priority):
            val = varprops[v][key]
            var.append(_encode_extrapair(key, val))
        # later occurrences of the same variable are written bare
        del varprops[v]
    return var


def _encode_extrapair(key, value):
    """Return an `<extrapair>` element holding `<path>` and `<value>`."""
    extrapair = etree.Element('extrapair')
    path = etree.Element('path')
    path.text = key
    val = etree.Element('value')
    val.text = value
    extrapair.extend([path, val])
    return extrapair


def _encode_ep(ep, varprops, lnk):
    """Return an `<ep>` element for the elementary predication *ep*."""
    attributes = {}
    if lnk:
        attributes['cfrom'] = str(ep.cfrom)
        attributes['cto'] = str(ep.cto)
        # surface strings are suppressed together with the alignments
        if ep.surface:
            attributes['surface'] = ep.surface
    if ep.base:
        attributes['base'] = ep.base
    e = etree.Element('ep', attrib=attributes)
    e.append(_encode_pred(ep.predicate))
    e.append(_encode_label(ep.label))
    for role in sorted(ep.args, key=role_priority):
        val = ep.args[role]
        # only the constant role holds a plain string; all other roles
        # are written as variables
        if role == CONSTANT_ROLE:
            e.append(_encode_arg(CONSTANT_ROLE, _encode_constant(val)))
        else:
            e.append(_encode_arg(role, _encode_variable(val, varprops)))
    return e


def _encode_pred(pred):
    """Return a `<realpred>`, `<pred>`, or `<spred>` element for *pred*."""
    p = None
    if predicate.is_surface(pred):
        lemma, pos, sense = predicate.split(pred)
        attributes = {'lemma': lemma, 'pos': pos}
        if sense is not None:
            attributes['sense'] = sense
        p = etree.Element('realpred', attrib=attributes)
    elif predicate.is_abstract(pred):
        p = etree.Element('pred')
        p.text = pred
    else:
        p = etree.Element('spred')
        p.text = pred
    return p


def _encode_arg(key, value):
    """Return an `<fvpair>` element pairing role name *key* with element *value*."""
    fvpair = etree.Element('fvpair')
    rargname = etree.Element('rargname')
    rargname.text = key
    fvpair.append(rargname)
    fvpair.append(value)
    return fvpair


def _encode_constant(value):
    """Return a `<constant>` element whose text is *value*."""
    const = etree.Element('constant')
    const.text = value
    return const


def _encode_hcon(hcon, varprops):
    """Return an `<hcons>` element; the lo side is always written as a label."""
    hcons_ = etree.Element('hcons', hreln=hcon.relation)
    hi = etree.Element('hi')
    hi.append(_encode_variable(hcon.hi, varprops))
    lo = etree.Element('lo')
    lo.append(_encode_label(hcon.lo))
    hcons_.extend([hi, lo])
    return hcons_


def _encode_icon(icon, varprops):
    """Return an `<icons>` element with `<left>` and `<right>` variables."""
    icons_ = etree.Element('icons', ireln=icon.relation)
    left = etree.Element('left')
    left.append(_encode_variable(icon.left, varprops))
    right = etree.Element('right')
    right.append(_encode_variable(icon.right, varprops))
    icons_.extend([left, right])
    return icons_


def _tostring(e, indent, offset):
    """
    Render element *e* as a string, optionally broken into lines.

    Args:
        e: the element to serialize
        indent (bool, int, None): `None` or `False` leaves the output
            on one line; `True` inserts newlines without indentation;
            an integer inserts newlines and uses that many spaces per
            indentation step
        offset (int): added to the regex group number of the matched
            tag to get the number of indentation steps
    Returns:
        the serialized string with surrounding whitespace stripped
    """
    string = etree.tostring(e, encoding='unicode')
    if indent is not None and indent is not False:
        if indent is True:
            indent = 0  # newlines only

        def indentmatch(m):
            return '\n' + (' ' * indent * (m.lastindex + offset)) + m.group()

        # One group per nesting depth: list close, mrs open/close, and
        # the children of mrs. `<icons\s` (not `<icons\s>`): the opening
        # tag always carries the ireln attribute, so the tag name is
        # followed by whitespace and never directly by `>`.
        string = re.sub(
            r'(</mrs-list>)'
            r'|(<mrs[^-]|</mrs>)'
            r'|(<ep[>\s]|<fvpair>|<extrapair>|<hcons\s|<icons\s)',
            indentmatch,
            string)
    return string.strip()