# -*- coding: utf-8 -*-
"""
MRX (XML for MRS) serialization and deserialization.
"""
import io
import re
from pathlib import Path
import xml.etree.ElementTree as etree
from delphin.mrs import MRS, EP, HCons, ICons, CONSTANT_ROLE
from delphin import predicate
from delphin.lnk import Lnk
from delphin import variable
from delphin.sembase import role_priority, property_priority
# Codec metadata: names the semantic representation handled by this module.
CODEC_INFO = {'representation': 'mrs'}

# Opening tag, joiner, and closing tag of an <mrs-list> document.
# NOTE(review): these are not referenced elsewhere in this module;
# presumably they are read by external callers -- confirm.
HEADER = '<mrs-list>'
JOINER = ''
FOOTER = '</mrs-list>'
##############################################################################
##############################################################################
# Pickle-API methods
def load(source):
    """
    Deserialize MRX from a file (handle or filename)

    An object with a `read` attribute is parsed directly; anything else
    is treated as a path (with `~` expanded) and opened for reading.

    Args:
        source (str, file): input filename or file object
    Returns:
        a list of MRS objects
    """
    if not hasattr(source, 'read'):
        path = Path(source).expanduser()
        with path.open() as fh:
            # exhaust the generator while the file is still open
            return list(_decode(fh))
    return list(_decode(source))
def loads(s):
    """
    Deserialize MRX string representations

    Args:
        s (str): an MRX string
    Returns:
        a list of MRS objects
    """
    # wrap the string in a file-like object so it can be parsed
    # incrementally by the same routine used for files
    buffer = io.StringIO(s)
    return list(_decode(buffer))
def dump(ms, destination, properties=True, lnk=True,
         indent=False, encoding='utf-8'):
    """
    Serialize MRS objects to MRX and write to a file

    The whole serialization is built in memory first and then written,
    followed by a newline, in a single `print` call.

    Args:
        ms: an iterator of MRS objects to serialize
        destination: filename or file object where data will be written
        properties: if `False`, suppress morphosemantic properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
        encoding (str): if *destination* is a filename, write to the
            file with the given encoding; otherwise it is ignored
    """
    text = dumps(ms, properties=properties, lnk=lnk, indent=indent)
    if not hasattr(destination, 'write'):
        # a filename: expand `~` and open it ourselves
        path = Path(destination).expanduser()
        with path.open('w', encoding=encoding) as fh:
            print(text, file=fh)
        return
    # an already-open, writable object
    print(text, file=destination)
def dumps(ms, properties=True, lnk=True, indent=False):
    """
    Serialize MRS objects to an MRX representation

    Args:
        ms: an iterator of MRS objects to serialize
        properties: if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        an MRX string representation of a corpus of MRS objects
    """
    root = _encode(ms, properties, lnk)
    # offset of 1: every <mrs> sits one level down, inside <mrs-list>
    return _tostring(root, indent, 1)
def decode(s):
    """
    Deserialize an MRS object from an MRX string.

    Args:
        s (str): the XML serialization of a single `<mrs>` element
    Returns:
        an MRS object
    """
    root = etree.fromstring(s)
    mrs = _decode_mrs(root)
    return mrs
def encode(m, properties=True, lnk=True, indent=False):
    """
    Serialize a MRS object to an MRX string.

    Args:
        m: an MRS object
        properties (bool): if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        an MRX-serialization of the MRS object
    """
    root = _encode_mrs(m, properties, lnk)
    # offset of 0: the <mrs> element is the top level of the output
    return _tostring(root, indent, 0)
##############################################################################
##############################################################################
# Decoding
def _decode(fh):
    """
    Lazily yield an MRS object for each <mrs> element read from *fh*.

    Args:
        fh: a file-like object containing MRX data
    Yields:
        MRS objects, in document order
    """
    # <!ELEMENT mrs-list (mrs)*>
    # if memory becomes a big problem, consider catching start events,
    # get the root element (later start events can be ignored), and
    # root.clear() after decoding each mrs
    completed = etree.iterparse(fh, events=('end',))
    for _event, node in completed:
        if node.tag != 'mrs':
            continue
        yield _decode_mrs(node)
        # the element has been converted; drop its contents
        node.clear()
def _decode_mrs(elem):
    """
    Build an MRS object from an <mrs> element.

    Args:
        elem: an <mrs> Element (or an ElementTree wrapping one)
    Returns:
        an MRS object
    """
    # <!ELEMENT mrs (label, var, (ep|hcons)*)>
    # <!ATTLIST mrs
    #           cfrom CDATA #IMPLIED
    #           cto   CDATA #IMPLIED
    #           surface   CDATA #IMPLIED
    #           ident     CDATA #IMPLIED >
    root = elem.find('.')  # in case elem is ElementTree rather than Element
    # filled in by _decode_var for every <var> that gets decoded below
    variables = {}

    top_elem = root.find('label')
    top = None if top_elem is None else _decode_label(top_elem)

    index_elem = root.find('var')
    if index_elem is None:
        index = None
    else:
        index = _decode_var(index_elem, variables=variables)

    rels = [_decode_ep(node, variables) for node in root.iter('ep')]
    hcons = [_decode_hcons(node, variables) for node in root.iter('hcons')]
    icons = [_decode_icons(node, variables) for node in root.iter('icons')]

    lnk = _decode_lnk(root.get('cfrom'), root.get('cto'))
    return MRS(top,
               index,
               rels,
               hcons,
               icons=icons,
               variables=variables,
               lnk=lnk,
               surface=root.get('surface'),
               identifier=root.get('ident'))
def _decode_label(elem):
    """
    Return the handle name for a <label> element.

    The result is the element's `vid` attribute prefixed with `h`.
    """
    # <!ELEMENT label (extrapair*)>
    # <!ATTLIST label
    #           vid CDATA #REQUIRED >
    # ignoring extrapairs
    return 'h' + elem.get('vid')
def _decode_var(elem, variables):
    """
    Return the variable name for a <var> element and record its properties.

    Args:
        elem: a <var> element
        variables (dict): mapping of variable names to property
            dictionaries; updated in place with this variable's
            extrapairs (an entry is created even if there are none)
    Returns:
        the lowercased sort joined with the vid (e.g., sort `x` and
        vid `4` give `x4`)
    """
    # <!ELEMENT var (extrapair*)>
    # <!ATTLIST var
    #           vid  CDATA #REQUIRED
    #           sort (x|e|h|u|l|i) #IMPLIED >
    # NOTE(review): the DTD marks sort as #IMPLIED, but a <var> without
    # it fails here since None has no lower() -- confirm this is intended
    var = elem.get('sort').lower() + elem.get('vid')
    props = variables.setdefault(var, {})
    props.update(_decode_extrapairs(elem.iter('extrapair')))
    return var
def _decode_extrapairs(elems):
    """
    Convert <extrapair> elements to (property, value) pairs.

    Args:
        elems: an iterable of <extrapair> elements
    Returns:
        a list of tuples with the path text uppercased and the value
        text lowercased
    """
    # <!ELEMENT extrapair (path,value)>
    # <!ELEMENT path (#PCDATA)>
    # <!ELEMENT value (#PCDATA)>
    pairs = []
    for pair in elems:
        prop = pair.find('path').text.upper()
        val = pair.find('value').text.lower()
        pairs.append((prop, val))
    return pairs
def _decode_ep(elem, variables=None):
    """
    Build an EP object from an <ep> element.

    Args:
        elem: an <ep> element
        variables (dict): mapping of variable names to properties,
            passed on to the argument decoder, which updates it
    Returns:
        an EP object
    """
    # <!ELEMENT ep ((pred|spred|realpred), label, fvpair*)>
    # <!ATTLIST ep
    #           cfrom CDATA #IMPLIED
    #           cto   CDATA #IMPLIED
    #           surface   CDATA #IMPLIED
    #           base      CDATA #IMPLIED >
    args = _decode_args(elem, variables=variables)
    # './' selects the first child, which the DTD says is the predicate
    pred = _decode_pred(elem.find('./'))
    label = _decode_label(elem.find('label'))
    lnk = _decode_lnk(elem.get('cfrom'), elem.get('cto'))
    return EP(pred,
              label,
              args=args,
              lnk=lnk,
              surface=elem.get('surface'),
              base=elem.get('base'))
def _decode_pred(elem):
    """
    Return the predicate encoded by a <pred>, <spred>, or <realpred>.

    Args:
        elem: the predicate element
    Returns:
        the element text for <pred> and <spred>; the result of
        `predicate.create()` on the lemma, pos, and sense attributes
        for <realpred>; `None` for any other tag
    """
    # <!ELEMENT pred (#PCDATA)>
    # <!ELEMENT spred (#PCDATA)>
    # <!ELEMENT realpred EMPTY>
    # <!ATTLIST realpred
    #           lemma CDATA #REQUIRED
    #           pos (v|n|j|r|p|q|c|x|u|a|s) #REQUIRED
    #           sense CDATA #IMPLIED >
    tag = elem.tag
    if tag == 'realpred':
        lemma = elem.get('lemma')
        pos = elem.get('pos')
        sense = elem.get('sense')
        return predicate.create(lemma, pos, sense)
    if tag in ('pred', 'spred'):
        return elem.text
    return None
def _decode_args(elem, variables=None):
    """
    Collect the role arguments of an <ep> element.

    Args:
        elem: an <ep> element
        variables (dict): mapping of variable names to properties,
            updated in place for every <var> value that is decoded
    Returns:
        a dict mapping uppercased role names to constant strings or
        variable names
    Raises:
        ValueError: if an <fvpair> has neither a <constant> nor a
            <var> value
    """
    # <!ELEMENT fvpair (rargname, (var|constant))>
    # This code assumes that only cargs have constant values, and all
    # other args (including IVs) have var values.
    args = {}
    for fvpair in elem.findall('fvpair'):
        rargname = fvpair.find('rargname').text.upper()
        constant = fvpair.find('constant')
        if constant is not None:
            args[rargname] = constant.text
            continue
        var = fvpair.find('var')
        if var is None:
            # without this check the value decoded for the previous
            # fvpair would silently be reused for this role
            raise ValueError(
                'fvpair for {} has neither a constant nor a var value'
                .format(rargname))
        args[rargname] = _decode_var(var, variables=variables)
    return args
def _decode_hcons(elem, variables):
    """
    Build an HCons object from an <hcons> element.

    Args:
        elem: an <hcons> element
        variables (dict): mapping of variable names to properties,
            updated in place by the variable decoder
    Returns:
        an HCons of the high variable, the `hreln` attribute, and the
        low variable or label
    """
    # <!ELEMENT hcons (hi, lo)>
    # <!ATTLIST hcons
    #           hreln (qeq|lheq|outscopes) #REQUIRED >
    # <!ELEMENT hi (var)>
    # <!ELEMENT lo (label|var)>
    hi = _decode_var(elem.find('hi/var'), variables)
    # 'lo/' selects the first child of <lo>, whichever kind it is
    lo_elem = elem.find('lo/')
    if lo_elem.tag != 'var':
        lo = _decode_label(lo_elem)
    else:
        lo = _decode_var(lo_elem, variables)
    relation = elem.get('hreln')
    return HCons(hi, relation, lo)
# This isn't part of the spec; it is included here in case it is added later.
def _decode_icons(elem, variables):
    """
    Build an ICons object from an <icons> element.

    Args:
        elem: an <icons> element
        variables (dict): mapping of variable names to properties,
            updated in place by the variable decoder
    Returns:
        an ICons of the left variable, the `ireln` attribute, and the
        right variable
    """
    # <!ELEMENT icons (left, right)>
    # <!ATTLIST icons
    #           ireln #REQUIRED >
    # <!ELEMENT left (var)>
    # <!ELEMENT right (var)>
    left = _decode_var(elem.find('left/var'), variables)
    relation = elem.get('ireln')
    right = _decode_var(elem.find('right/var'), variables)
    return ICons(left, relation, right)
def _decode_lnk(cfrom, cto):
    """
    Return a character-span Lnk for the given attribute values.

    Args:
        cfrom: value of a `cfrom` attribute, or `None` if absent
        cto: value of a `cto` attribute, or `None` if absent
    Returns:
        `None` if both values are `None`, otherwise the result of
        `Lnk.charspan(cfrom, cto)`
    Raises:
        ValueError: if exactly one of the two values is `None`
    """
    if cfrom is None and cto is None:
        return None
    if cfrom is None or cto is None:
        raise ValueError('Both cfrom and cto, or neither, must be specified.')
    return Lnk.charspan(cfrom, cto)
##############################################################################
##############################################################################
# Encoding
def _encode(ms, properties, lnk):
    """
    Build an <mrs-list> element containing one <mrs> per item in *ms*.

    Args:
        ms: an iterable of MRS objects
        properties: passed through to the per-MRS encoder
        lnk: passed through to the per-MRS encoder
    Returns:
        the <mrs-list> element
    """
    root = etree.Element('mrs-list')
    for mrs in ms:
        child = _encode_mrs(mrs, properties, lnk)
        root.append(child)
    return root
def _encode_mrs(m, properties, lnk):
    """
    Build an <mrs> element from an MRS object.

    Args:
        m: an MRS object; anything else is first converted with
            `MRS.from_xmrs()`
        properties: if false, no variable properties are written
        lnk: if false, the cfrom, cto, and surface attributes are
            omitted here and the flag is passed on to the EP encoder
    Returns:
        the <mrs> element, with children in the order label, var,
        eps, hcons, icons
    """
    # attempt to convert if necessary
    if not isinstance(m, MRS):
        m = MRS.from_xmrs(m)

    # work on a copy: _encode_variable deletes each variable's entry
    # once its properties have been written
    varprops = dict(m.variables) if properties else {}

    attributes = {}
    if lnk:
        attributes.update(cfrom=str(m.cfrom), cto=str(m.cto))
        if m.surface is not None:
            attributes['surface'] = m.surface
    if m.identifier is not None:
        attributes['ident'] = m.identifier

    root = etree.Element('mrs', attrib=attributes)
    if m.top is not None:
        root.append(_encode_label(m.top))
    if m.index is not None:
        root.append(_encode_variable(m.index, varprops))
    for rel in m.rels:
        root.append(_encode_ep(rel, varprops, lnk))
    for hcon in m.hcons:
        root.append(_encode_hcon(hcon, varprops))
    for icon in m.icons:
        root.append(_encode_icon(icon, varprops))
    return root
def _encode_label(label):
    """
    Build a <label> element whose `vid` is the id part of *label*.

    The label is split with `variable.split()`; the sort part is
    discarded.
    """
    _sort, vid = variable.split(label)
    attributes = {'vid': vid}
    return etree.Element('label', attrib=attributes)
def _encode_variable(v, varprops):
    """
    Build a <var> element for the variable *v*.

    Args:
        v: the variable name, split into sort and vid with
            `variable.split()`
        varprops (dict): mapping of variable names to property
            dictionaries; if *v* has a non-empty entry its properties
            are written as <extrapair> children and the entry is then
            deleted, so each variable's properties are written once
    Returns:
        the <var> element
    """
    sort, vid = variable.split(v)
    node = etree.Element('var', vid=vid, sort=sort)
    props = varprops.get(v)
    if props:
        for name in sorted(props, key=property_priority):
            node.append(_encode_extrapair(name, props[name]))
        del varprops[v]
    return node
def _encode_extrapair(key, value):
    """
    Build an <extrapair> element for one variable property.

    Args:
        key: text of the <path> child
        value: text of the <value> child
    Returns:
        the <extrapair> element
    """
    extrapair = etree.Element('extrapair')
    etree.SubElement(extrapair, 'path').text = key
    etree.SubElement(extrapair, 'value').text = value
    return extrapair
def _encode_ep(ep, varprops, lnk):
    """
    Build an <ep> element from an EP object.

    Args:
        ep: the EP object to encode
        varprops (dict): mapping of variable names to properties,
            consumed by the variable encoder
        lnk: if false, the cfrom, cto, surface, and base attributes
            are omitted
    Returns:
        the <ep> element: the predicate, the label, then one <fvpair>
        per argument in `role_priority` order
    """
    attributes = {}
    if lnk:
        attributes.update(cfrom=str(ep.cfrom), cto=str(ep.cto))
        if ep.surface:
            attributes['surface'] = ep.surface
        if ep.base:
            attributes['base'] = ep.base

    node = etree.Element('ep', attrib=attributes)
    node.append(_encode_pred(ep.predicate))
    node.append(_encode_label(ep.label))
    for role in sorted(ep.args, key=role_priority):
        value = ep.args[role]
        if role != CONSTANT_ROLE:
            fvpair = _encode_arg(role, _encode_variable(value, varprops))
        else:
            # the constant role holds a string, not a variable
            fvpair = _encode_arg(CONSTANT_ROLE, _encode_constant(value))
        node.append(fvpair)
    return node
def _encode_pred(pred):
    """
    Build the predicate element for *pred*.

    Args:
        pred: the predicate to encode
    Returns:
        a <realpred> element with lemma, pos, and (if not `None`)
        sense attributes when `predicate.is_surface(pred)` is true; a
        <pred> element with *pred* as its text when
        `predicate.is_abstract(pred)` is true; otherwise an <spred>
        element with *pred* as its text
    """
    if predicate.is_surface(pred):
        lemma, pos, sense = predicate.split(pred)
        attributes = {'lemma': lemma, 'pos': pos}
        if sense is not None:
            attributes['sense'] = sense
        return etree.Element('realpred', attrib=attributes)

    tag = 'pred' if predicate.is_abstract(pred) else 'spred'
    node = etree.Element(tag)
    node.text = pred
    return node
def _encode_arg(key, value):
    """
    Build an <fvpair> element for one role argument.

    Args:
        key: text of the <rargname> child
        value: an already-built element, appended after <rargname>
    Returns:
        the <fvpair> element
    """
    fvpair = etree.Element('fvpair')
    etree.SubElement(fvpair, 'rargname').text = key
    fvpair.append(value)
    return fvpair
def _encode_constant(value):
    """
    Build a <constant> element with *value* as its text.
    """
    node = etree.Element('constant')
    node.text = value
    return node
def _encode_hcon(hcon, varprops):
    """
    Build an <hcons> element from a handle constraint.

    Args:
        hcon: the constraint; its `relation`, `hi`, and `lo`
            attributes are read
        varprops (dict): mapping of variable names to properties,
            consumed by the variable encoder
    Returns:
        the <hcons> element, with `hi` written as a <var> and `lo`
        written as a <label>
    """
    hcons_ = etree.Element('hcons', hreln=hcon.relation)
    hi = etree.SubElement(hcons_, 'hi')
    hi.append(_encode_variable(hcon.hi, varprops))
    lo = etree.SubElement(hcons_, 'lo')
    lo.append(_encode_label(hcon.lo))
    return hcons_
def _encode_icon(icon, varprops):
    """
    Build an <icons> element from an individual constraint.

    Args:
        icon: the constraint; its `relation`, `left`, and `right`
            attributes are read
        varprops (dict): mapping of variable names to properties,
            consumed by the variable encoder
    Returns:
        the <icons> element, with both sides written as <var>
    """
    icons_ = etree.Element('icons', ireln=icon.relation)
    left = etree.SubElement(icons_, 'left')
    left.append(_encode_variable(icon.left, varprops))
    right = etree.SubElement(icons_, 'right')
    right.append(_encode_variable(icon.right, varprops))
    return icons_
def _tostring(e, indent, offset):
    """
    Serialize an element to a string, optionally breaking lines.

    Args:
        e: the element to serialize
        indent (bool, int, None): if `None` or `False`, no line breaks
            are added; if `True`, line breaks are added without
            indentation; if an integer, line breaks are added and each
            level is indented by that many spaces
        offset (int): number of levels added to the depth of every
            tag that is moved to its own line
    Returns:
        the serialization, stripped of surrounding whitespace
    """
    string = etree.tostring(e, encoding='unicode')
    if indent is None or indent is False:
        return string.strip()
    width = 0 if indent is True else indent

    def indentmatch(m):
        # m.lastindex is the number of the group that matched, which
        # doubles as the depth: 1 for the list's closing tag, 2 for
        # <mrs> tags, 3 for the elements inside an <mrs>
        return '\n' + (' ' * width * (m.lastindex + offset)) + m.group()

    # Start tags with attributes are matched up to the following
    # whitespace. The last alternative used to be `<icons\s>`, which
    # can never match `<icons ireln="...">`, so <icons> elements were
    # not given their own line.
    string = re.sub(
        r'(</mrs-list>)'
        r'|(<mrs[^-]|</mrs>)'
        r'|(<ep[>\s]|<fvpair>|<extrapair>|<hcons\s|<icons\s)',
        indentmatch,
        string)
    return string.strip()