"""
DMRX (XML for DMRS) serialization and deserialization.
"""
import xml.etree.ElementTree as etree
from pathlib import Path
from delphin import predicate
from delphin.dmrs import CVARSORT, DMRS, Link, Node
from delphin.lnk import Lnk
# Codec metadata: the representation this module reads and writes.
CODEC_INFO = {"representation": "dmrs"}

# Opening and closing tags of the top-level <dmrs-list> element.
# JOINER is presumably the text placed between serialized items -- it is
# not used by the code visible in this module; verify against callers.
HEADER = "<dmrs-list>"
JOINER = ""
FOOTER = "</dmrs-list>"
##############################################################################
##############################################################################
# Pickle-API methods
[docs]
def load(source):
    """
    Read a DMRX file and return the DMRS objects it contains.

    Args:
        source (str, file): path of the file to read, or an open file
            object (anything with a ``read`` attribute)
    Returns:
        a list of DMRS objects
    """
    if hasattr(source, "read"):
        stream = source
    else:
        # a path: expand a leading "~" and hand the filename to the parser
        stream = str(Path(source).expanduser())
    return list(_decode(stream))
[docs]
def loads(s):
    """
    Parse a DMRX string and return the DMRS objects it contains.

    Args:
        s (str): a DMRX document; every child of its root element is
            decoded as one DMRS
    Returns:
        a list of DMRS objects
    """
    root = etree.fromstring(s)
    return [_decode_dmrs(child) for child in root]
[docs]
def dump(ds, destination, properties=True, lnk=True, indent=False, encoding="utf-8"):
    """
    Serialize DMRS objects to DMRX and write the result to a file.

    Args:
        ds: an iterator of DMRS objects to serialize
        destination: filename or file object where data will be written
        properties: if `False`, suppress morphosemantic properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
        encoding (str): if *destination* is a filename, write to the
            file with the given encoding; otherwise it is ignored
    """
    serialized = dumps(ds, properties=properties, lnk=lnk, indent=indent)
    if not hasattr(destination, "write"):
        # a path: expand a leading "~" and open it ourselves
        path = Path(destination).expanduser()
        with path.open("w", encoding=encoding) as fh:
            print(serialized, file=fh)
        return
    # an already-open, writable object: the caller owns it, so don't close
    print(serialized, file=destination)
[docs]
def dumps(ds, properties=True, lnk=True, indent=False):
    """
    Serialize DMRS objects to a DMRX string.

    Args:
        ds: an iterator of DMRS objects to serialize
        properties: if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add
            newlines and indentation
    Returns:
        a DMRX string representation of a corpus of DMRS objects
    """
    return _encode(ds, properties=properties, lnk=lnk, indent=indent)
[docs]
def decode(s):
    """
    Deserialize a single DMRS object from a DMRX string.

    Note:
        The string is parsed as one <dmrs> element; the top-level
        <dmrs-list> element is not expected.
    """
    return _decode_dmrs(etree.fromstring(s))
[docs]
def encode(d, properties=True, lnk=True, indent=False):
    """
    Serialize a single DMRS object to a DMRX string.

    Note:
        The result is a bare <dmrs> element without the top-level
        <dmrs-list> wrapper, so it is not valid with the DMRX schema,
        but it may be more useful if you work with single DMRX objects
        at a time rather than lists of them.

    Args:
        d: a DMRS object
        properties (bool): if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` (or one of the strings "LKB",
            "Lkb", "lkb"), add newlines without indentation; if an
            integer value, add newlines and that much indentation
            per level; if `False` or `None`, add no whitespace
    Returns:
        a DMRX-serialization of the DMRS object
    """
    root = _encode_dmrs(d, properties, lnk)
    lkb_style = indent is True or indent in ("LKB", "Lkb", "lkb")
    if lkb_style:
        _indent(root, indent=0, maxdepth=2, level=0)
    elif not (indent is False or indent is None):
        _indent(root, indent, maxdepth=3, level=0)
    serialized = etree.tostring(root, encoding="unicode")
    return serialized.rstrip()
##############################################################################
##############################################################################
# Decoding
def _decode(fh):
    """
    <!ELEMENT dmrs-list (dmrs)*>

    Incrementally parse *fh* and yield one DMRS object each time a
    <dmrs> element has been completely read.

    if memory becomes a big problem, consider catching start events,
    get the root element (later start events can be ignored), and
    root.clear() after decoding each dmrs
    """
    for _event, node in etree.iterparse(fh, events=("end",)):
        if node.tag != "dmrs":
            continue
        yield _decode_dmrs(node)
        # the element has been decoded; drop its children and attributes
        node.clear()
def _decode_dmrs(elem):
    """
    <!ELEMENT dmrs (node|link)*>
    <!ATTLIST dmrs
              cfrom CDATA #REQUIRED
              cto   CDATA #REQUIRED
              surface   CDATA #IMPLIED
              ident     CDATA #IMPLIED >

    The `top` and `index` attributes (not listed in the DTD above) are
    read as well. They are converted to integers so that they compare
    equal to the integer node ids decoded from <node nodeid="..."> and
    <link from="..." to="...">; an absent or empty attribute gives
    `None`.

    Args:
        elem: a <dmrs> Element (or an ElementTree wrapping one)
    Returns:
        the decoded DMRS object
    Raises:
        ValueError: if `top` or `index` is present but not an integer
    """
    elem = elem.find(".")  # in case elem is an ElementTree rather than Element

    def node_id(name):
        # absent/empty attribute -> None; otherwise an int like node ids
        value = elem.get(name)
        return int(value) if value else None

    return DMRS(
        top=node_id("top"),
        index=node_id("index"),
        nodes=list(map(_decode_node, elem.iter("node"))),
        links=list(map(_decode_link, elem.iter("link"))),
        lnk=_decode_lnk(elem),
        surface=elem.get("surface"),
        identifier=elem.get("ident"),
    )
def _decode_node(elem):
    """
    <!ELEMENT node ((realpred|gpred), sortinfo)>
    <!ATTLIST node
              nodeid CDATA #REQUIRED
              cfrom  CDATA #REQUIRED
              cto    CDATA #REQUIRED
              surface   CDATA #IMPLIED
              base      CDATA #IMPLIED
              carg CDATA #IMPLIED >

    Build a Node from a <node> element. The predicate is taken from the
    element's first child, and the `cvarsort` entry of <sortinfo>
    becomes the node's type instead of one of its properties.
    """
    sortinfo = _decode_sortinfo(elem.find("sortinfo"))
    # separate cvarsort (the node type) from the remaining properties
    cvarsort = sortinfo.pop(CVARSORT, None)
    node_type = None if cvarsort is None else cvarsort.lower()
    return Node(
        id=int(elem.get("nodeid")),
        predicate=_decode_pred(elem.find("*[1]")),
        type=node_type,
        properties=sortinfo,  # cvarsort was removed above
        lnk=_decode_lnk(elem),
        surface=elem.get("surface"),
        base=elem.get("base"),
        carg=elem.get("carg"),
    )
def _decode_pred(elem):
    """
    <!ELEMENT realpred EMPTY>
    <!ATTLIST realpred
              lemma CDATA #REQUIRED
              pos (v|n|j|r|p|q|c|x|u|a|s) #REQUIRED
              sense CDATA #IMPLIED >
    <!ELEMENT gpred (#PCDATA)>

    Args:
        elem: the <realpred> or <gpred> Element of a node
    Returns:
        the normalized predicate string
    Raises:
        ValueError: if *elem* is `None` (the node has no child element)
            or is neither a <gpred> nor a <realpred> element
    """
    if elem is None:
        raise ValueError(
            "node has no predicate element; expected <realpred> or <gpred>"
        )
    if elem.tag == "gpred":
        pred = elem.text
    elif elem.tag == "realpred":
        pred = predicate.create(elem.get("lemma"), elem.get("pos"), elem.get("sense"))
    else:
        # without this branch `pred` would be unbound below and the
        # caller would only see an opaque UnboundLocalError
        raise ValueError(
            "unexpected predicate element <{}>; "
            "expected <realpred> or <gpred>".format(elem.tag)
        )
    return predicate.normalize(pred)
def _decode_sortinfo(elem):
    """
    <!ELEMENT sortinfo EMPTY>
    <!ATTLIST sortinfo
              cvarsort (x|e|i|u) #IMPLIED
              num  (sg|pl|u) #IMPLIED
              pers (1|2|3|1-or-3|u) #IMPLIED
              gend (m|f|n|m-or-f|u) #IMPLIED
              sf (prop|ques|comm|prop-or-ques|u) #IMPLIED
              tense (past|pres|fut|tensed|untensed|u) #IMPLIED
              mood (indicative|subjunctive|u) #IMPLIED
              prontype (std_pron|zero_pron|refl|u) #IMPLIED
              prog (plus|minus|u) #IMPLIED
              perf (plus|minus|u) #IMPLIED
              ind  (plus|minus|u) #IMPLIED >

    note: Just accept any properties, since these are ERG-specific

    Returns a dict of the element's attributes with lowercased values;
    keys are uppercased, except the CVARSORT key which is kept as is.
    """
    decoded = {}
    for name, value in elem.attrib.items():
        if name != CVARSORT:
            name = name.upper()
        decoded[name] = value.lower()
    return decoded
def _decode_link(elem):
    """
    <!ELEMENT link (rargname, post)>
    <!ATTLIST link
              from CDATA #REQUIRED
              to   CDATA #REQUIRED >
    <!ELEMENT rargname (#PCDATA)>
    <!ELEMENT post (#PCDATA)>

    Build a Link from a <link> element. The role and post values are the
    text of the <rargname> and <post> children, or `None` when a child
    is missing.
    """
    rargname = elem.find("rargname")
    post = elem.find("post")
    return Link(
        start=int(elem.get("from")),
        end=int(elem.get("to")),
        role=None if rargname is None else rargname.text,
        post=None if post is None else post.text,
    )
def _decode_lnk(elem):
    """
    Build a character-span Lnk from the cfrom/cto attributes of *elem*.

    A missing attribute is passed on as the string "-1".
    """
    cfrom = elem.get("cfrom", "-1")
    cto = elem.get("cto", "-1")
    return Lnk.charspan(cfrom, cto)
##############################################################################
##############################################################################
# Encoding
def _encode(ds, properties, lnk, indent):
    """
    Serialize the DMRS objects in *ds* as one <dmrs-list> document.

    Args:
        ds: iterable of DMRS objects
        properties: passed through to the per-DMRS encoder
        lnk: passed through to the per-DMRS encoder
        indent: `True` or "LKB"/"Lkb"/"lkb" for newlines without
            indentation; `False`/`None` for no added whitespace; any
            other value is used as the indentation width per level
    Returns:
        the serialized document with trailing whitespace stripped
    """
    root = etree.Element("dmrs-list")
    root.extend([_encode_dmrs(d, properties, lnk) for d in ds])
    lkb_style = indent is True or indent in ("LKB", "Lkb", "lkb")
    if lkb_style:
        _indent(root, indent=0, maxdepth=3, level=0)
    elif not (indent is False or indent is None):
        _indent(root, indent, maxdepth=4, level=0)
    serialized = etree.tostring(root, encoding="unicode")
    return serialized.rstrip()
def _encode_dmrs(d, properties, lnk):
    """
    Build the <dmrs> element for the DMRS object *d*.

    Args:
        d: the DMRS object to serialize
        properties: passed through to the node encoder
        lnk: if false, omit the cfrom/cto and surface attributes
    Returns:
        the <dmrs> Element with its <node> children followed by its
        <link> children
    """
    attrs = {}
    if lnk:
        attrs["cfrom"] = str(d.cfrom)
        attrs["cto"] = str(d.cto)
    # top and index are optional; only written when set
    for name, value in (("top", d.top), ("index", d.index)):
        if value is not None:
            attrs[name] = str(value)
    if lnk and d.surface is not None:
        attrs["surface"] = d.surface
    if d.identifier is not None:
        attrs["ident"] = d.identifier
    root = etree.Element("dmrs", attrib=attrs)
    root.extend([_encode_node(node, properties, lnk) for node in d.nodes])
    root.extend([_encode_link(link) for link in d.links])
    return root
def _encode_node(node, properties, lnk):
    """
    Build the <node> element for a DMRS node.

    Args:
        node: the DMRS node to serialize
        properties: if false, write an empty <sortinfo/> element
            instead of the node's sortinfo
        lnk: if false, omit the cfrom/cto alignment and the surface
            string
    Returns:
        the <node> Element, containing the predicate element followed
        by <sortinfo>
    """
    attributes = {"nodeid": str(node.id)}
    if lnk:
        attributes["cfrom"] = str(node.cfrom)
        attributes["cto"] = str(node.cto)
        # The surface string goes with the alignment: `lnk=False` is
        # documented as suppressing "surface alignments and strings",
        # and the <dmrs> element already gates its surface the same way.
        if node.surface is not None:
            attributes["surface"] = node.surface
    if node.base is not None:
        attributes["base"] = node.base
    if node.carg is not None:
        attributes["carg"] = node.carg
    e = etree.Element("node", attrib=attributes)
    e.append(_encode_pred(node.predicate))
    if properties:
        # XML attribute names and values are written in lowercase
        sortinfo = {key.lower(): val.lower() for key, val in node.sortinfo.items()}
    else:
        sortinfo = {}
    e.append(etree.Element("sortinfo", attrib=sortinfo))
    return e
def _encode_pred(pred):
    """
    Build the predicate element for *pred*.

    The predicate is normalized first. A surface predicate becomes a
    <realpred> element with lemma, pos, and (when non-empty) sense
    attributes; anything else becomes a <gpred> element whose text is
    the normalized predicate.
    """
    normalized = predicate.normalize(pred)
    if not predicate.is_surface(normalized):
        gpred = etree.Element("gpred")
        gpred.text = normalized
        return gpred
    lemma, pos, sense = predicate.split(normalized)
    attrs = {"lemma": lemma, "pos": pos}
    if sense:
        attrs["sense"] = sense
    return etree.Element("realpred", attrib=attrs)
def _encode_link(link):
    """
    Build the <link> element for *link*.

    The start and end ids become the `from` and `to` attributes; the
    role and post values become the text of the <rargname> and <post>
    children, in that order.
    """
    endpoints = {"from": str(link.start), "to": str(link.end)}
    elem = etree.Element("link", attrib=endpoints)
    etree.SubElement(elem, "rargname").text = link.role
    etree.SubElement(elem, "post").text = link.post
    return elem
# inspired by Fredrik Lundh's indent() function:
# http://effbot.org/zone/element-lib.htm
def _indent(elem, indent, maxdepth, level):
    """
    Add newlines and indentation to *elem* and its descendants in place.

    Whitespace is stored in the `text` and `tail` attributes of the
    elements. Elements at depth *maxdepth* or deeper are left untouched,
    so they are serialized inline inside their parent.

    Args:
        elem: the Element to format
        indent (int): number of spaces per nesting level (0 gives
            newlines without indentation)
        maxdepth (int): depth at which formatting stops
        level (int): depth of *elem*; 0 for the outermost element
    """
    if level == maxdepth:
        return
    curind = "\n" + " " * indent * level
    nxtind = "\n" + " " * indent * (level + 1)
    # whatever follows this element starts on a new line at this level
    elem.tail = curind
    if not len(elem):
        return
    # children get their own lines only if they are above maxdepth
    expand = level + 1 < maxdepth
    if expand and not elem.text:
        elem.text = nxtind
    for child in elem:
        _indent(child, indent, maxdepth, level + 1)
    # The tail of the LAST CHILD is what precedes this element's closing
    # tag: align the closing tag with the opening tag when the children
    # are expanded, or keep it flush against inline children otherwise.
    elem[-1].tail = curind if expand else ""