"""
MRX (XML for MRS) serialization and deserialization.
"""
import io
import re
import xml.etree.ElementTree as etree
from pathlib import Path
from delphin import predicate, variable
from delphin.lnk import Lnk
from delphin.mrs import CONSTANT_ROLE, EP, MRS, HCons, ICons
from delphin.sembase import property_priority, role_priority
# Codec metadata: the semantic representation this module (de)serializes.
CODEC_INFO = dict(representation='mrs')

# Opening tag, item separator, and closing tag of an <mrs-list> document.
# They are not referenced inside this module (presumably consumed by
# external callers that stream items one at a time -- TODO confirm).
HEADER, JOINER, FOOTER = '<mrs-list>', '', '</mrs-list>'
##############################################################################
##############################################################################
# Pickle-API methods
[docs]
def load(source, encoding='utf-8'):
    """
    Deserialize MRX from a file (handle or filename)

    Args:
        source (str, file): input filename or file object
        encoding (str): if *source* is a filename, read the file with
            the given encoding; otherwise it is ignored
    Returns:
        a list of MRS objects
    """
    # Anything with a read() method is treated as an already-open file.
    if hasattr(source, 'read'):
        return list(_decode(source))
    # Open with an explicit encoding (the same default that dump() uses
    # for writing) so that reading does not depend on the platform's
    # locale encoding.
    path = Path(source).expanduser()
    with path.open(encoding=encoding) as fh:
        # Materialize the generator before the file is closed.
        return list(_decode(fh))
[docs]
def loads(s):
    """
    Deserialize a string of MRX into MRS objects

    Args:
        s (str): an MRX string
    Returns:
        a list of MRS objects
    """
    # Wrap the string in a file-like object so the incremental parser
    # used for files can be reused unchanged.
    return list(_decode(io.StringIO(s)))
[docs]
def dump(ms, destination, properties=True, lnk=True,
         indent=False, encoding='utf-8'):
    """
    Serialize MRS objects to MRX and write the result to a file

    Args:
        ms: an iterator of MRS objects to serialize
        destination: filename or file object where data will be written
        properties: if `False`, suppress morphosemantic properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
        encoding (str): if *destination* is a filename, write to the
            file with the given encoding; otherwise it is ignored
    """
    serialized = dumps(ms, properties=properties, lnk=lnk, indent=indent)
    if not hasattr(destination, 'write'):
        # A filename was given: open (and truncate) it ourselves.
        path = Path(destination).expanduser()
        with path.open('w', encoding=encoding) as fh:
            print(serialized, file=fh)
        return
    # An open, writable object was given; the caller owns its lifetime.
    print(serialized, file=destination)
[docs]
def dumps(ms, properties=True, lnk=True, indent=False):
    """
    Serialize MRS objects to an MRX string

    Args:
        ms: an iterator of MRS objects to serialize
        properties: if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        an MRX string representation of a corpus of MRS objects
    """
    # All items are wrapped in a single <mrs-list> root; the offset of 1
    # is passed through to the indentation helper for this list form.
    root = _encode(ms, properties, lnk)
    return _tostring(root, indent, 1)
[docs]
def decode(s):
    """
    Deserialize a single MRS object from an MRX string.

    Args:
        s (str): an MRX string that is parsed as one XML document
    Returns:
        the MRS object decoded from the parsed root element
    """
    return _decode_mrs(etree.fromstring(s))
[docs]
def encode(m, properties=True, lnk=True, indent=False):
    """
    Serialize a single MRS object to an MRX string.

    Args:
        m: an MRS object
        properties (bool): if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        an MRX-serialization of the MRS object
    """
    # No <mrs-list> wrapper here, hence the offset of 0 passed to the
    # indentation helper.
    mrs_elem = _encode_mrs(m, properties, lnk)
    return _tostring(mrs_elem, indent, 0)
##############################################################################
##############################################################################
# Decoding
def _decode(fh):
    """Lazily yield one decoded MRS per <mrs> element parsed from *fh*."""
    # <!ELEMENT mrs-list (mrs)*>
    # if memory becomes a big problem, consider catching start events,
    # get the root element (later start events can be ignored), and
    # root.clear() after decoding each mrs
    parsed = etree.iterparse(fh, events=('end',))
    for _event, node in parsed:
        if node.tag != 'mrs':
            continue
        yield _decode_mrs(node)
        # Drop the element's children, text, and attributes once decoded.
        node.clear()
def _decode_mrs(elem):
    """Build an MRS object from an <mrs> element (or ElementTree)."""
    # <!ELEMENT mrs (label, var, (ep|hcons)*)>
    # <!ATTLIST mrs
    #           cfrom   CDATA #IMPLIED
    #           cto     CDATA #IMPLIED
    #           surface CDATA #IMPLIED
    #           ident   CDATA #IMPLIED >
    # in case elem is ElementTree rather than Element
    root = elem.find('.')

    # Filled in as a side effect of decoding every <var> encountered below.
    variables = {}

    top_elem = root.find('label')
    top = None if top_elem is None else _decode_label(top_elem)

    index_elem = root.find('var')
    if index_elem is None:
        index = None
    else:
        index = _decode_var(index_elem, variables=variables)

    rels = [_decode_ep(node, variables) for node in root.iter('ep')]
    hcons = [_decode_hcons(node, variables) for node in root.iter('hcons')]
    icons = [_decode_icons(node, variables) for node in root.iter('icons')]

    span = _decode_lnk(root.get('cfrom'), root.get('cto'))
    return MRS(top,
               index,
               rels,
               hcons,
               icons=icons,
               variables=variables,
               lnk=span,
               surface=root.get('surface'),
               identifier=root.get('ident'))
def _decode_label(elem):
# <!ELEMENT label (extrapair*)>
# <!ATTLIST label
# vid CDATA #REQUIRED >
vid = elem.get('vid')
# ignoring extrapairs
return 'h' + vid
def _decode_var(elem, variables):
    """
    Return the variable name '<sort><vid>' for a <var> element.

    Any properties found in <extrapair> descendants are merged into
    ``variables[name]``, which is created if it does not exist yet.
    """
    # <!ELEMENT var (extrapair*)>
    # <!ATTLIST var
    #           vid  CDATA #REQUIRED
    #           sort (x|e|h|u|l|i) #IMPLIED >
    vid = elem.get('vid')
    # NOTE: 'sort' is read unconditionally, so a <var> without it fails here.
    name = elem.get('sort').lower() + vid
    props = variables.setdefault(name, {})
    props.update(_decode_extrapairs(elem.iter('extrapair')))
    return name
def _decode_extrapairs(elems):
# <!ELEMENT extrapair (path,value)>
# <!ELEMENT path (#PCDATA)>
# <!ELEMENT value (#PCDATA)>
return [(e.find('path').text.upper(), e.find('value').text.lower())
for e in elems]
def _decode_ep(elem, variables=None):
    """Build an EP object from an <ep> element."""
    # <!ELEMENT ep ((pred|spred|realpred), label, fvpair*)>
    # <!ATTLIST ep
    #           cfrom   CDATA #IMPLIED
    #           cto     CDATA #IMPLIED
    #           surface CDATA #IMPLIED
    #           base    CDATA #IMPLIED >
    args = _decode_args(elem, variables=variables)
    # './' selects the first child element, which the DTD above says is
    # the predicate element.
    pred = _decode_pred(elem.find('./'))
    label = _decode_label(elem.find('label'))
    span = _decode_lnk(elem.get('cfrom'), elem.get('cto'))
    return EP(pred,
              label,
              args=args,
              lnk=span,
              surface=elem.get('surface'),
              base=elem.get('base'))
def _decode_pred(elem):
# <!ELEMENT pred (#PCDATA)>
# <!ELEMENT spred (#PCDATA)>
# <!ELEMENT realpred EMPTY>
# <!ATTLIST realpred
# lemma CDATA #REQUIRED
# pos (v|n|j|r|p|q|c|x|u|a|s) #REQUIRED
# sense CDATA #IMPLIED >
if elem.tag in ('pred', 'spred'):
return elem.text
elif elem.tag == 'realpred':
return predicate.create(elem.get('lemma'),
elem.get('pos'),
elem.get('sense'))
def _decode_args(elem, variables=None):
# <!ELEMENT fvpair (rargname, (var|constant))>
# This code assumes that only cargs have constant values, and all
# other args (including IVs) have var values.
args = {}
for e in elem.findall('fvpair'):
rargname = e.find('rargname').text.upper()
if e.find('constant') is not None:
argval = e.find('constant').text
elif e.find('var') is not None:
argval = _decode_var(e.find('var'), variables=variables)
args[rargname] = argval
return args
def _decode_hcons(elem, variables):
    """Build an HCons object from an <hcons> element."""
    # <!ELEMENT hcons (hi, lo)>
    # <!ATTLIST hcons
    #           hreln (qeq|lheq|outscopes) #REQUIRED >
    # <!ELEMENT hi (var)>
    # <!ELEMENT lo (label|var)>
    hi = _decode_var(elem.find('hi/var'), variables)
    # 'lo/' selects the first child of <lo>, whichever tag it has.
    lo_elem = elem.find('lo/')
    if lo_elem.tag != 'var':
        lo = _decode_label(lo_elem)
    else:
        lo = _decode_var(lo_elem, variables)
    return HCons(hi, elem.get('hreln'), lo)
# this isn't part of the spec; just putting here in case it's added later
def _decode_icons(elem, variables):
    """Build an ICons object from an <icons> element."""
    # <!ELEMENT icons (left, right)>
    # <!ATTLIST icons
    #           ireln #REQUIRED >
    # <!ELEMENT left (var)>
    # <!ELEMENT right (var)>
    left = _decode_var(elem.find('left/var'), variables)
    relation = elem.get('ireln')
    right = _decode_var(elem.find('right/var'), variables)
    return ICons(left, relation, right)
def _decode_lnk(cfrom, cto):
if cfrom is cto is None:
return None
elif None in (cfrom, cto):
raise ValueError('Both cfrom and cto, or neither, must be specified.')
else:
return Lnk.charspan(cfrom, cto)
##############################################################################
##############################################################################
# Encoding
def _encode(ms, properties, lnk):
    """Return an <mrs-list> element with one <mrs> child per item of *ms*."""
    root = etree.Element('mrs-list')
    for mrs in ms:
        child = _encode_mrs(mrs, properties, lnk)
        root.append(child)
    return root
def _encode_mrs(m, properties, lnk):
    """
    Return an <mrs> element for the MRS object *m*.

    Variable properties are included only if *properties* is true;
    cfrom/cto/surface attributes only if *lnk* is true.
    """
    # Work on a copy of the mapping: the variable encoder deletes an
    # entry once its properties have been written.
    varprops = dict(m.variables) if properties else {}

    attrs = {}
    if lnk:
        attrs['cfrom'] = str(m.cfrom)
        attrs['cto'] = str(m.cto)
        if m.surface is not None:
            attrs['surface'] = m.surface
    if m.identifier is not None:
        attrs['ident'] = m.identifier

    node = etree.Element('mrs', attrib=attrs)
    # Children are appended in order: label, var, then each ep, hcons,
    # and icons.
    if m.top is not None:
        node.append(_encode_label(m.top))
    if m.index is not None:
        node.append(_encode_variable(m.index, varprops))
    for ep in m.rels:
        node.append(_encode_ep(ep, varprops, lnk))
    for hcon in m.hcons:
        node.append(_encode_hcon(hcon, varprops))
    for icon in m.icons:
        node.append(_encode_icon(icon, varprops))
    return node
def _encode_label(label):
    """Return a <label> element whose vid is the id part of *label*."""
    # Only the id half of the split is used; the sort half is discarded.
    _sort, vid = variable.split(label)
    return etree.Element('label', attrib={'vid': vid})
def _encode_variable(v, varprops):
    """
    Return a <var> element for the variable *v*.

    If *varprops* has a non-empty entry for *v*, its properties are
    written as <extrapair> children and the entry is then removed from
    *varprops*.
    """
    sort, vid = variable.split(v)
    node = etree.Element('var', attrib={'vid': vid, 'sort': sort})
    props = varprops.get(v)
    if props:
        for key in sorted(props, key=property_priority):
            node.append(_encode_extrapair(key, props[key]))
        # Remove the entry so later encodings of this variable add no
        # <extrapair> children.
        del varprops[v]
    return node
def _encode_extrapair(key, value):
extrapair = etree.Element('extrapair')
path = etree.Element('path')
path.text = key
val = etree.Element('value')
val.text = value
extrapair.extend([path, val])
return extrapair
def _encode_ep(ep, varprops, lnk):
    """
    Return an <ep> element for the elementary predication *ep*.

    The cfrom/cto/surface/base attributes are written only if *lnk* is
    true. Arguments are emitted as <fvpair> children ordered by
    role_priority.
    """
    attrs = {}
    if lnk:
        attrs['cfrom'] = str(ep.cfrom)
        attrs['cto'] = str(ep.cto)
        if ep.surface:
            attrs['surface'] = ep.surface
        if ep.base:
            attrs['base'] = ep.base

    node = etree.Element('ep', attrib=attrs)
    node.append(_encode_pred(ep.predicate))
    node.append(_encode_label(ep.label))
    for role in sorted(ep.args, key=role_priority):
        value = ep.args[role]
        # Only the constant role carries a <constant>; every other role
        # is written as a <var>.
        if role != CONSTANT_ROLE:
            encoded = _encode_variable(value, varprops)
            node.append(_encode_arg(role, encoded))
        else:
            encoded = _encode_constant(value)
            node.append(_encode_arg(CONSTANT_ROLE, encoded))
    return node
def _encode_pred(pred):
    """
    Return the predicate element for *pred*.

    Surface predicates become <realpred> with lemma/pos (and sense, when
    present) attributes; abstract predicates become <pred>; anything
    else becomes <spred>. The latter two carry *pred* as their text.
    """
    if predicate.is_surface(pred):
        lemma, pos, sense = predicate.split(pred)
        attrs = {'lemma': lemma, 'pos': pos}
        if sense is not None:
            attrs['sense'] = sense
        return etree.Element('realpred', attrib=attrs)

    tag = 'pred' if predicate.is_abstract(pred) else 'spred'
    node = etree.Element(tag)
    node.text = pred
    return node
def _encode_arg(key, value):
fvpair = etree.Element('fvpair')
rargname = etree.Element('rargname')
rargname.text = key
fvpair.append(rargname)
fvpair.append(value)
return fvpair
def _encode_constant(value):
const = etree.Element('constant')
const.text = value
return const
def _encode_hcon(hcon, varprops):
    """
    Return an <hcons> element for the handle constraint *hcon*.

    The hi side is written as a <var> and the lo side as a <label>.
    """
    node = etree.Element('hcons', hreln=hcon.relation)
    hi = etree.SubElement(node, 'hi')
    hi.append(_encode_variable(hcon.hi, varprops))
    lo = etree.SubElement(node, 'lo')
    lo.append(_encode_label(hcon.lo))
    return node
def _encode_icon(icon, varprops):
    """
    Return an <icons> element for the individual constraint *icon*.

    Both sides are written as <var> elements, left before right.
    """
    node = etree.Element('icons', ireln=icon.relation)
    left = etree.SubElement(node, 'left')
    left.append(_encode_variable(icon.left, varprops))
    right = etree.SubElement(node, 'right')
    right.append(_encode_variable(icon.right, varprops))
    return node
def _tostring(e, indent, offset):
string = etree.tostring(e, encoding='unicode')
if indent is not None and indent is not False:
if indent is True:
indent = 0
def indentmatch(m):
return '\n' + (' ' * indent * (m.lastindex + offset)) + m.group()
string = re.sub(
r'(</mrs-list>)'
r'|(<mrs[^-]|</mrs>)'
r'|(<ep[>\s]|<fvpair>|<extrapair>|<hcons\s|<icons\s>)',
indentmatch,
string)
return string.strip()