Source code for delphin.codecs.dmrx

"""
DMRX (XML for DMRS) serialization and deserialization.
"""

import xml.etree.ElementTree as etree
from pathlib import Path

from delphin import predicate
from delphin.dmrs import CVARSORT, DMRS, Link, Node
from delphin.lnk import Lnk

CODEC_INFO = {
    'representation': 'dmrs',
}

HEADER = '<dmrs-list>'
JOINER = ''
FOOTER = '</dmrs-list>'


##############################################################################
##############################################################################
# Pickle-API methods


def load(source):
    """
    Read every DMRS from a DMRX file.

    Args:
        source (str, file): a filename, or an object with a ``read``
            attribute (treated as an open file)
    Returns:
        a list of DMRS objects
    """
    if hasattr(source, 'read'):
        # already a file-like object; hand it to the parser untouched
        target = source
    else:
        # a path: expand a leading "~" and pass it on as a plain string
        target = str(Path(source).expanduser())
    return list(_decode(target))


def loads(s):
    """
    Read every DMRS from a DMRX string.

    Args:
        s (str): a DMRX string; each direct child of its root element
            is decoded as one DMRS
    Returns:
        a list of DMRS objects
    """
    return [_decode_dmrs(child) for child in etree.fromstring(s)]


def dump(ds, destination, properties=True, lnk=True, indent=False,
         encoding='utf-8'):
    """
    Write DMRS objects to a file as DMRX.

    Args:
        ds: an iterator of DMRS objects to serialize
        destination: filename or file object where data will be written
        properties: if `False`, suppress morphosemantic properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
        encoding (str): if *destination* is a filename, write to the
            file with the given encoding; otherwise it is ignored
    """
    serialized = dumps(ds, properties=properties, lnk=lnk, indent=indent)
    if not hasattr(destination, 'write'):
        # a path was given: open (and close) the file ourselves
        path = Path(destination).expanduser()
        with path.open('w', encoding=encoding) as out:
            print(serialized, file=out)
        return
    # a writable object was given: the caller owns its lifetime
    print(serialized, file=destination)


def dumps(ds, properties=True, lnk=True, indent=False):
    """
    Render DMRS objects as a DMRX string.

    Args:
        ds: an iterator of DMRS objects to serialize
        properties: if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        a DMRX string representation of a corpus of DMRS objects
    """
    serialized = _encode(ds, properties, lnk, indent)
    return serialized


def decode(s):
    """
    Build a single DMRS object from a DMRX string.

    Note:
        The string is parsed as one element and decoded directly, so
        it should not be wrapped in the top-level <dmrs-list> element.
    """
    return _decode_dmrs(etree.fromstring(s))


def encode(d, properties=True, lnk=True, indent=False):
    """
    Render a single DMRS object as a DMRX string.

    Note:
        The result does not include the top-level <dmrs-list>
        element, so it is not valid with the DMRX schema, but it may
        be more useful if you work with single DMRX objects at a time
        rather than lists of them.

    Args:
        d: a DMRS object
        properties (bool): if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        a DMRX-serialization of the DMRS object
    """
    root = _encode_dmrs(d, properties, lnk)
    lkb_style = indent is True or indent in ('LKB', 'Lkb', 'lkb')
    if lkb_style:
        # newlines only (zero-width indentation), two levels deep
        _indent(root, indent=0, maxdepth=2, level=0)
    elif not (indent is False or indent is None):
        # caller-specified indentation width, three levels deep
        _indent(root, indent, maxdepth=3, level=0)
    # drop the trailing whitespace left by the root element's tail
    return etree.tostring(root, encoding='unicode').rstrip()


##############################################################################
##############################################################################
# Decoding


def _decode(fh):
    """
    Lazily decode each <dmrs> element found while parsing *fh*.

    Args:
        fh: a filename or file object to parse incrementally
    Yields:
        one decoded DMRS per completed <dmrs> element
    """
    # <!ELEMENT dmrs-list (dmrs)*>
    # if memory becomes a big problem, consider catching start events,
    # get the root element (later start events can be ignored), and
    # root.clear() after decoding each mrs
    for _, elem in etree.iterparse(fh, events=('end',)):
        if elem.tag == 'dmrs':
            yield _decode_dmrs(elem)
            # the element has been consumed; release its children
            elem.clear()


def _decode_dmrs(elem):
    """Build a DMRS from a <dmrs> element (or a tree rooted at one)."""
    # <!ELEMENT dmrs (node|link)*>
    # <!ATTLIST dmrs
    #           cfrom   CDATA #REQUIRED
    #           cto     CDATA #REQUIRED
    #           surface CDATA #IMPLIED
    #           ident   CDATA #IMPLIED >
    elem = elem.find('.')  # in case elem is an ElementTree rather than Element
    return DMRS(top=elem.get('top'),
                index=elem.get('index'),
                nodes=list(map(_decode_node, elem.iter('node'))),
                links=list(map(_decode_link, elem.iter('link'))),
                lnk=_decode_lnk(elem),
                surface=elem.get('surface'),
                identifier=elem.get('ident'))


def _decode_node(elem):
    """Build a Node from a <node> element."""
    # <!ELEMENT node ((realpred|gpred), sortinfo)>
    # <!ATTLIST node
    #           nodeid  CDATA #REQUIRED
    #           cfrom   CDATA #REQUIRED
    #           cto     CDATA #REQUIRED
    #           surface CDATA #IMPLIED
    #           base    CDATA #IMPLIED
    #           carg    CDATA #IMPLIED >
    sortinfo = _decode_sortinfo(elem.find('sortinfo'))
    # cvarsort is passed as the node's type rather than as a property
    node_type = None
    if CVARSORT in sortinfo:
        node_type = sortinfo.pop(CVARSORT).lower()
    return Node(id=int(elem.get('nodeid')),
                predicate=_decode_pred(elem.find('*[1]')),
                type=node_type,
                properties=sortinfo,  # without cvarsort; see above
                lnk=_decode_lnk(elem),
                surface=elem.get('surface'),
                base=elem.get('base'),
                carg=elem.get('carg'))


def _decode_pred(elem):
    """
    Return the normalized predicate for a <realpred> or <gpred> element.

    Raises:
        ValueError: if *elem* is missing or is neither a <realpred>
            nor a <gpred> element
    """
    # <!ELEMENT realpred EMPTY>
    # <!ATTLIST realpred
    #           lemma CDATA #REQUIRED
    #           pos   (v|n|j|r|p|q|c|x|u|a|s) #REQUIRED
    #           sense CDATA #IMPLIED >
    # <!ELEMENT gpred (#PCDATA)>
    tag = getattr(elem, 'tag', None)  # elem is None for a childless <node>
    if tag == 'gpred':
        pred = elem.text
    elif tag == 'realpred':
        pred = predicate.create(elem.get('lemma'),
                                elem.get('pos'),
                                elem.get('sense'))
    else:
        # fail with a clear message instead of an unbound local variable
        raise ValueError(
            'expected a <realpred> or <gpred> element, got: {!r}'.format(tag))
    return predicate.normalize(pred)


def _decode_sortinfo(elem):
    """
    Return the attributes of a <sortinfo> element as a dict.

    Values are lowercased; keys are uppercased except for the
    CVARSORT key, which is left as-is.
    """
    # <!ELEMENT sortinfo EMPTY>
    # <!ATTLIST sortinfo
    #           cvarsort (x|e|i|u) #IMPLIED
    #           num  (sg|pl|u) #IMPLIED
    #           pers (1|2|3|1-or-3|u) #IMPLIED
    #           gend (m|f|n|m-or-f|u) #IMPLIED
    #           sf (prop|ques|comm|prop-or-ques|u) #IMPLIED
    #           tense (past|pres|fut|tensed|untensed|u) #IMPLIED
    #           mood (indicative|subjunctive|u) #IMPLIED
    #           prontype (std_pron|zero_pron|refl|u) #IMPLIED
    #           prog (plus|minus|u) #IMPLIED
    #           perf (plus|minus|u) #IMPLIED
    #           ind  (plus|minus|u) #IMPLIED >
    # note: Just accept any properties, since these are ERG-specific
    return {(key.upper() if key != CVARSORT else key): val.lower()
            for key, val in elem.attrib.items()}


def _decode_link(elem):
    """Build a Link from a <link> element."""
    # <!ELEMENT link (rargname, post)>
    # <!ATTLIST link
    #           from CDATA #REQUIRED
    #           to   CDATA #REQUIRED >
    # <!ELEMENT rargname (#PCDATA)>
    # <!ELEMENT post (#PCDATA)>
    return Link(start=int(elem.get('from')),
                end=int(elem.get('to')),
                # getattr() tolerates a missing child (find() gives None)
                role=getattr(elem.find('rargname'), 'text', None),
                post=getattr(elem.find('post'), 'text', None))


def _decode_lnk(elem):
    """Build a character-span Lnk from cfrom/cto (each defaulting to '-1')."""
    return Lnk.charspan(elem.get('cfrom', '-1'), elem.get('cto', '-1'))


##############################################################################
##############################################################################
# Encoding


def _encode(ds, properties, lnk, indent):
    """
    Serialize DMRS objects into a <dmrs-list> document string.

    Args:
        ds: an iterable of DMRS objects
        properties: if false, write empty <sortinfo> elements
        lnk: if false, omit cfrom/cto and surface attributes
        indent: `True` or an 'LKB' spelling for newline-only layout;
            `False` or `None` for no layout; otherwise the indentation
            width per nesting level
    Returns:
        the serialized XML with trailing whitespace removed
    """
    e = etree.Element('dmrs-list')
    for d in ds:
        e.append(_encode_dmrs(d, properties, lnk))
    if indent is True or indent in ('LKB', 'Lkb', 'lkb'):
        _indent(e, indent=0, maxdepth=3, level=0)
    elif indent is not False and indent is not None:
        _indent(e, indent, maxdepth=4, level=0)
    # the root's tail (set by _indent) would leave a trailing newline
    return etree.tostring(e, encoding='unicode').rstrip()


def _encode_dmrs(d, properties, lnk):
    """Build a <dmrs> element, with node and link children, from *d*."""
    attributes = {}
    if lnk:
        attributes['cfrom'] = str(d.cfrom)
        attributes['cto'] = str(d.cto)
    if d.top is not None:
        attributes['top'] = str(d.top)
    if d.index is not None:
        attributes['index'] = str(d.index)
    if lnk and d.surface is not None:
        attributes['surface'] = d.surface
    if d.identifier is not None:
        attributes['ident'] = d.identifier
    e = etree.Element('dmrs', attrib=attributes)
    for node in d.nodes:
        e.append(_encode_node(node, properties, lnk))
    for link in d.links:
        e.append(_encode_link(link))
    return e


def _encode_node(node, properties, lnk):
    """Build a <node> element (predicate plus <sortinfo>) from *node*."""
    attributes = {'nodeid': str(node.id)}
    if lnk:
        attributes['cfrom'] = str(node.cfrom)
        attributes['cto'] = str(node.cto)
    # surface strings are suppressed together with the alignments, as
    # is done for the <dmrs> element itself
    if lnk and node.surface is not None:
        attributes['surface'] = node.surface
    if node.base is not None:
        attributes['base'] = node.base
    if node.carg is not None:
        attributes['carg'] = node.carg
    e = etree.Element('node', attrib=attributes)
    e.append(_encode_pred(node.predicate))
    if properties:
        sortinfo = {key.lower(): val.lower()
                    for key, val in node.sortinfo.items()}
    else:
        sortinfo = {}
    # <sortinfo> is always written, even when it has no attributes
    e.append(etree.Element('sortinfo', attrib=sortinfo))
    return e


def _encode_pred(pred):
    """Build a <realpred> (surface predicate) or <gpred> element."""
    pred = predicate.normalize(pred)
    if predicate.is_surface(pred):
        lemma, pos, sense = predicate.split(pred)
        attributes = {'lemma': lemma, 'pos': pos}
        if sense:
            attributes['sense'] = sense
        e = etree.Element('realpred', attrib=attributes)
    else:
        e = etree.Element('gpred')
        e.text = pred
    return e


def _encode_link(link):
    """Build a <link> element with <rargname> and <post> children."""
    e = etree.Element('link', attrib={'from': str(link.start),
                                      'to': str(link.end)})
    rargname = etree.Element('rargname')
    rargname.text = link.role
    post = etree.Element('post')
    post.text = link.post
    e.append(rargname)
    e.append(post)
    return e


# inspired by Fredrik Lundh's indent() function:
# http://effbot.org/zone/element-lib.htm
def _indent(elem, indent, maxdepth, level):
    """
    Add layout whitespace to *elem* and its descendants, in place.

    Args:
        elem: the element to lay out
        indent (int): number of spaces per nesting level (0 gives
            newlines without indentation)
        maxdepth (int): elements at this level or deeper are left
            untouched, so their parents are written on a single line
        level (int): the nesting level of *elem*
    """
    if level == maxdepth:
        return
    curind = '\n' + ' ' * indent * level
    nxtind = '\n' + ' ' * indent * (level + 1)
    if not len(elem):
        elem.tail = curind
        return
    lay_out_children = level + 1 < maxdepth
    if lay_out_children and not elem.text:
        # break the line after the opening tag
        elem.text = nxtind
    elem.tail = curind
    for child in elem:
        _indent(child, indent, maxdepth, level + 1)
    # The last child's tail is what precedes this element's closing tag,
    # so it (not the parent's own tail) must drop back to this level, or
    # be empty when the children are kept on the parent's line.
    elem[-1].tail = curind if lay_out_children else ''