"""
PyDelphin API counterparts to the `delphin` commands.
The public functions in this module largely mirror the front-end
subcommands provided by the `delphin` command, with some small changes
to argument names or values to be better-suited to being called from
within Python.
"""
import sys
import os
import io
import json
from functools import partial
import logging
from delphin import itsdb, tsql
from delphin.mrs import xmrs
from delphin.util import safe_int, SExpr
from delphin.exceptions import PyDelphinException
# Set up default logging configuration at import time so that messages
# logged by the commands below (e.g., conversion failures) have a handler
# even when the caller has not configured logging.
logging.basicConfig()
###############################################################################
### CONVERT ###################################################################
def convert(path, source_fmt, target_fmt, select='result:mrs',
            properties=True, show_status=False, predicate_modifiers=False,
            color=False, pretty_print=False, indent=None):
    """
    Convert between various DELPH-IN Semantics representations.

    Args:
        path (str, file): filename, testsuite directory, open file, or
            stream of input representations; if `None`, input is read
            from stdin
        source_fmt (str): convert from this format
        target_fmt (str): convert to this format
        select (str): TSQL query for selecting data (ignored if *path*
            is not a testsuite directory; default: `"result:mrs"`)
        properties (bool): include morphosemantic properties if `True`
            (default: `True`)
        show_status (bool): show disconnected EDS nodes (ignored if
            *target_fmt* is not `"eds"`; default: `False`)
        predicate_modifiers (bool): apply EDS predicate modification
            for certain kinds of patterns (ignored if *target_fmt* is
            not an EDS format; default: `False`)
        color (bool): apply syntax highlighting if `True` and
            *target_fmt* is `"simplemrs"` (default: `False`)
        pretty_print (bool): if `True`, format the output with
            newlines and default indentation (default: `False`)
        indent (int, optional): specifies an explicit number of spaces
            for indentation (implies *pretty_print*)
    Returns:
        str: the converted representation
    Raises:
        ValueError: if *source_fmt* is an EDS format and *target_fmt*
            is not, if *select* does not project exactly one column,
            or if either format name is not recognized
    """
    if source_fmt.startswith('eds') and not target_fmt.startswith('eds'):
        raise ValueError(
            'Conversion from EDS to non-EDS currently not supported.')

    if indent:
        pretty_print = True
        indent = 4 if indent is True else safe_int(indent)

    if len(tsql.inspect_query('select ' + select)['projection']) != 1:
        raise ValueError('Exactly 1 column must be given in selection query: '
                         '(e.g., result:mrs)')

    # read
    loads = _get_codec(source_fmt)
    if path is None:
        xs = loads(sys.stdin.read())
    elif hasattr(path, 'read'):
        xs = loads(path.read())
    elif os.path.isdir(path):
        ts = itsdb.TestSuite(path)
        # only the first representation of each selected value is used
        xs = [
            next(iter(loads(r[0])), None)
            for r in tsql.select(select, ts)
        ]
    else:
        # read the whole file up front so the handle is closed promptly,
        # even if the codec consumes the text lazily
        with open(path, 'r') as fh:
            xs = loads(fh.read())

    # write
    dumps = _get_codec(target_fmt, load=False)
    kwargs = {}
    if color:
        kwargs['color'] = color
    if pretty_print:
        kwargs['pretty_print'] = pretty_print
    if indent:
        kwargs['indent'] = indent
    if target_fmt == 'eds':
        kwargs['pretty_print'] = pretty_print
        kwargs['show_status'] = show_status
    if target_fmt.startswith('eds'):
        kwargs['predicate_modifiers'] = predicate_modifiers
    kwargs['properties'] = properties

    # this is not a great way to improve robustness when converting
    # many representations, but it'll do until v1.0.0. Also, it only
    # improves robustness on the output, not the input.
    # Note that all the code below is to replace the following:
    #     return dumps(xs, **kwargs)
    head, joiner, tail = _get_output_details(target_fmt)
    parts = []
    if pretty_print:
        joiner = joiner.strip() + '\n'

    def _trim(s):
        # strip the per-item list wrapper so items can be re-joined below
        if head and s.startswith(head):
            s = s[len(head):].lstrip('\n')
        if tail and s.endswith(tail):
            s = s[:-len(tail)].rstrip('\n')
        return s

    # serialize one item at a time so a single failure only drops that item
    for x in xs:
        try:
            s = dumps([x], **kwargs)
        except (PyDelphinException, KeyError, IndexError):
            logging.exception('could not convert representation')
        else:
            s = _trim(s)
            parts.append(s)

    # set these after so head and tail are used correctly in _trim
    if pretty_print:
        if head:
            head += '\n'
        if tail:
            tail = '\n' + tail
    return head + joiner.join(parts) + tail
def _get_codec(codec, load=True):
if codec == 'simplemrs':
from delphin.mrs import simplemrs
return simplemrs.loads if load else simplemrs.dumps
elif codec == 'ace' and load:
return _read_ace_parse
elif codec == 'mrx':
from delphin.mrs import mrx
return mrx.loads if load else mrx.dumps
elif codec == 'mrs-prolog' and not load:
from delphin.mrs import prolog
return prolog.dumps
elif codec == 'dmrx':
from delphin.mrs import dmrx
return dmrx.loads if load else dmrx.dumps
elif codec == 'simpledmrs' and not load:
from delphin.mrs import simpledmrs
return simpledmrs.dumps
elif codec == 'dmrs-tikz' and not load:
from delphin.extra import latex
return latex.dmrs_tikz_dependency
elif codec in ('mrs-json', 'dmrs-json', 'eds-json'):
cls = {'mrs-json': _MRS_JSON,
'dmrs-json': _DMRS_JSON,
'eds-json': _EDS_JSON}[codec]
return cls().loads if load else cls().dumps
elif codec in ('dmrs-penman', 'eds-penman'):
if codec == 'dmrs-penman':
model = xmrs.Dmrs
elif codec == 'eds-penman':
from delphin.mrs.eds import Eds as model
func = _penman_loads if load else _penman_dumps
return partial(func, model=model)
elif codec == 'eds':
from delphin.mrs import eds
return eds.loads if load else eds.dumps
elif load:
raise ValueError('invalid source format: ' + codec)
else:
raise ValueError('invalid target format: ' + codec)
def _get_output_details(codec):
if codec == 'mrx':
return ('<mrs-list', '', '</mrs-list>')
elif codec == 'dmrx':
from delphin.mrs import dmrx
return ('<dmrs-list>', '', '</dmrs-list>')
elif codec in ('mrs-json', 'dmrs-json', 'eds-json'):
return ('[', ',', ']')
else:
return ('', ' ', '')
# simulate JSON codecs for MRS, DMRS, and EDS
class _MRS_JSON(object):
    """
    Minimal JSON codec built on the model class stored in `CLS`.

    Subclasses change the model by overriding `CLS`.
    """

    CLS = xmrs.Mrs

    def getlist(self, o):
        """Return *o* wrapped in a list if it is a dict, else *o* itself."""
        return [o] if isinstance(o, dict) else o

    def load(self, f):
        """Decode JSON from the file object *f* into a list of `CLS`."""
        decoded = self.getlist(json.load(f))
        return [self.CLS.from_dict(item) for item in decoded]

    def loads(self, s):
        """Decode the JSON string *s* into a list of `CLS` instances."""
        decoded = self.getlist(json.loads(s))
        return [self.CLS.from_dict(item) for item in decoded]

    def dumps(self,
              xs,
              properties=True,
              pretty_print=False,
              indent=None,
              **kwargs):
        """
        Serialize the items of *xs* as a single JSON list.

        Items that are not already `CLS` instances are converted with
        `CLS.from_xmrs(x, **kwargs)` first. If *pretty_print* is set
        and *indent* is `None`, an indent of 2 is used.
        """
        if indent is None and pretty_print:
            indent = 2
        model = self.CLS
        dicts = []
        for x in xs:
            if not isinstance(x, model):
                x = model.from_xmrs(x, **kwargs)
            dicts.append(model.to_dict(x, properties=properties))
        return json.dumps(dicts, indent=indent)
# Same JSON codec as _MRS_JSON, but using xmrs.Dmrs as the model class
class _DMRS_JSON(_MRS_JSON):
    CLS = xmrs.Dmrs
# Same JSON codec as _MRS_JSON, but using eds.Eds as the model class
class _EDS_JSON(_MRS_JSON):
    # imported in the class body rather than at the top of the file; note
    # that this also binds `eds` as a class attribute
    from delphin.mrs import eds
    CLS = eds.Eds
# load Penman module on demand
def _penman_loads(s, model=None, **kwargs):
    """
    Deserialize the PENMAN string *s*, importing the codec on first use.

    *model* and any extra keyword arguments are passed through to
    `delphin.mrs.penman.loads()`.
    """
    from delphin.mrs import penman as penman_codec
    kwargs['model'] = model
    return penman_codec.loads(s, **kwargs)
def _penman_dumps(xs, model=None, **kwargs):
    """
    Serialize each item of *xs* to PENMAN, one item at a time.

    An item that fails with an `EncodeError` is logged and contributes
    an empty string. The per-item strings are joined with newlines.
    """
    from delphin.mrs import penman as penman_codec
    serialized = []
    for x in xs:
        try:
            text = penman_codec.dumps([x], model=model, **kwargs)
        except penman_codec.penman.EncodeError:
            logging.error('Invalid graph; possibly disconnected')
            text = ''
        serialized.append(text)
    return '\n'.join(serialized)
# read simplemrs from ACE output
def _read_ace_parse(s):
    """
    Yield MRS objects read from ACE output.

    Args:
        s: the ACE output; if it has a `decode` method it is first
            decoded as UTF-8
    Yields:
        the objects returned by `simplemrs.loads(..., single=True)`
        for each MRS found; for regular ACE output the most recent
        `SENT: ` text is attached as the `surface` attribute
    """
    from delphin.mrs import simplemrs
    if hasattr(s, 'decode'):
        s = s.decode('utf-8')
    surface = None
    newline = False
    for line in s.splitlines():
        if line.startswith('SENT: '):
            # remember the sentence text (after the 6-character prefix)
            surface = line[6:]
        # regular ACE output
        elif line.startswith('['):
            # only the text before the first ' ; ' is parsed as an MRS
            m = line.partition(' ; ')[0].strip()
            m = simplemrs.loads(m, single=True)
            m.surface = surface
            yield m
        # with --tsdb-stdout
        elif line.startswith('('):
            # a line may hold several S-expressions; parse them in turn
            while line:
                expr = SExpr.parse(line)
                line = expr.remainder.lstrip()
                if len(expr.data) == 2 and expr.data[0] == ':results':
                    for result in expr.data[1]:
                        for key, val in result:
                            if key == ':mrs':
                                # NOTE: no surface is attached in this mode
                                yield simplemrs.loads(val, single=True)
        # NOTE(review): str.splitlines() removes line terminators, so
        # `line` can never equal '\n' here and this branch (the reset
        # of `surface` on every second blank line) looks unreachable.
        # Blank lines arrive as '' -- confirm the intended behavior
        # before changing the test.
        elif line == '\n':
            if newline:
                surface = None
                newline = False
            else:
                newline = True
        else:
            pass
###############################################################################
### SELECT ####################################################################
def select(dataspec, testsuite, mode='list', cast=True):
    """
    Select data from [incr tsdb()] profiles.

    Args:
        dataspec (str): TSQL select query (e.g., `'i-id i-input mrs'`
            or `'* from item where readings > 0'`)
        testsuite (str, TestSuite): testsuite or path to testsuite
            containing data to select
        mode (str): see :func:`delphin.itsdb.select_rows` for a
            description of the *mode* parameter (default: `list`)
        cast (bool): if `True`, cast column values to their datatype
            according to the relations file (default: `True`)
    Returns:
        a generator that yields selected data
    """
    # normalize the input to a TestSuite object before querying
    if isinstance(testsuite, itsdb.ItsdbProfile):
        suite = itsdb.TestSuite(testsuite.root)
    elif isinstance(testsuite, itsdb.TestSuite):
        suite = testsuite
    else:
        suite = itsdb.TestSuite(testsuite)
    return tsql.select(dataspec, suite, mode=mode, cast=cast)
###############################################################################
### MKPROF ####################################################################
def mkprof(destination, source=None, relations=None, where=None,
           in_place=False, skeleton=False, full=False, gzip=False):
    """
    Create [incr tsdb()] profiles or skeletons.

    The data for the new testsuite is taken either from an existing
    testsuite or from a list of sentences. The four usage patterns
    are:

    - `source="testsuite/"` -- read data from `testsuite/`
    - `source=None, in_place=True` -- read data from *destination*
    - `source=None, in_place=False` -- read sentences from stdin
    - `source="sents.txt"` -- read sentences from `sents.txt`

    The *relations* parameter must be specified for the latter two.

    Args:
        destination (str): path of the new testsuite
        source (str): path to a source testsuite or a file containing
            sentences; if not given and *in_place* is `False`,
            sentences are read from stdin
        relations (str): path to a relations file to use for the
            created testsuite; if `None` and *source* is given, the
            relations file of the source testsuite is used
        where (str): TSQL condition to filter records by; ignored if
            *source* is not a testsuite
        in_place (bool): if `True` and *source* is not given, use
            *destination* as the source for data (default: `False`)
        skeleton (bool): if `True`, only write tsdb-core files
            (default: `False`)
        full (bool): if `True`, copy all data from the source
            testsuite (requires *source* to be a testsuite path;
            default: `False`)
        gzip (bool): if `True`, non-empty tables will be compressed
            with gzip
    Raises:
        ValueError: if incompatible options are combined, if *full* is
            used without a source testsuite, or if no usable relations
            file can be determined
    """
    # reject contradictory options before touching the filesystem
    if skeleton and full:
        raise ValueError("'skeleton' is incompatible with 'full'")
    if skeleton and in_place:
        raise ValueError("'skeleton' is incompatible with 'in_place'")
    if in_place and source is not None:
        raise ValueError("'in_place' is incompatible with 'source'")

    if in_place:
        source = destination

    if full and (source is None or not os.path.isdir(source)):
        raise ValueError("'full' must be used with a source testsuite")

    if relations is None and source is not None and os.path.isdir(source):
        # fall back to the relations file of the source testsuite
        relations = os.path.join(source, 'relations')
    elif relations is None or not os.path.isfile(relations):
        raise ValueError('invalid or missing relations file: {}'
                         .format(relations))

    # create the destination testsuite
    _prepare_output_directory(destination)
    dts = itsdb.TestSuite(path=destination, relations=relations)

    if source is None:
        # sentences come from stdin
        items = _lines_to_rows(sys.stdin, dts.relations)
        dts.write({'item': items}, gzip=gzip)
    elif os.path.isfile(source):
        # sentences come from a file
        with open(source) as fh:
            items = _lines_to_rows(fh, dts.relations)
            dts.write({'item': items}, gzip=gzip)
    elif os.path.isdir(source):
        # data comes from a source testsuite
        sts = itsdb.TestSuite(source)
        if full:
            tables = dts.relations.tables
        else:
            tables = itsdb.tsdb_core_files
        where_clause = '' if where is None else 'where ' + where
        for table in tables:
            if not sts.size(table) > 0:
                continue
            # filter the data, but use all if the query fails
            # (e.g., if the filter and table cannot be joined)
            query = '* from {} {}'.format(table, where_clause)
            try:
                rows = tsql.select(query, sts, cast=False)
            except itsdb.ItsdbError:
                rows = sts[table]
            dts.write({table: rows}, gzip=gzip)
    dts.reload()

    # unless a skeleton was requested, make empty files for other tables
    if not skeleton:
        for table in dts.relations:
            if len(dts[table]) == 0:
                dts.write({table: []})

    # report the size of each file written; gzipped names are shown
    # in red when stdout is a terminal
    highlight = '\x1b[1;31m{}\x1b[0m' if sys.stdout.isatty() else '{}'
    report = '{:>8} bytes\t{}'
    for name in ['relations'] + list(dts.relations.tables):
        plain = os.path.join(destination, name)
        zipped = plain + '.gz'
        if os.path.isfile(plain):
            print(report.format(os.stat(plain).st_size, name))
        elif os.path.isfile(zipped):
            print(report.format(os.stat(zipped).st_size,
                                highlight.format(name + '.gz')))
def _lines_to_rows(lines, relations):
# field indices only need to be computed once, so don't use
# itsdb.Record.from_dict()
i_id_idx = relations['item'].index('i-id')
i_wf_idx = relations['item'].index('i-wf')
i_input_idx = relations['item'].index('i-input')
num_fields = len(relations['item'])
def make_row(i_id, i_wf, i_input):
row = [None] * num_fields
row[i_id_idx] = i_id
row[i_wf_idx] = i_wf
row[i_input_idx] = i_input
return itsdb.Record(relations['item'], row)
for i, line in enumerate(lines):
i_wf, i_input = (0, line[1:]) if line.startswith('*') else (1, line)
yield make_row(i * 10, i_wf, i_input.strip())
###############################################################################
### PROCESS ###################################################################
[docs]def process(grammar, testsuite, source=None, select=None,
generate=False, transfer=False, options=None,
all_items=False, result_id=None, gzip=False):
"""
Process (e.g., parse) a [incr tsdb()] profile.
Results are written to directly to *testsuite*.
If *select* is `None`, the defaults depend on the task:
========== =========================
Task Default value of *select*
========== =========================
Parsing `item:i-input`
Transfer `result:mrs`
Generation `result:mrs`
========== =========================
Args:
grammar (str): path to a compiled grammar image
testsuite (str): path to a [incr tsdb()] testsuite where data
will be read from (see *source*) and written to
source (str): path to a [incr tsdb()] testsuite; if `None`,
*testsuite* is used as the source of data
select (str): TSQL query for selecting processor inputs
(default depends on the processor type)
generate (bool): if `True`, generate instead of parse
(default: `False`)
transfer (bool): if `True`, transfer instead of parse
(default: `False`)
options (list): list of ACE command-line options to use when
invoking the ACE subprocess; unsupported options will
give an error message
all_items (bool): if `True`, don't exclude ignored items
(those with `i-wf==2`) when parsing
result_id (int): if given, only keep items with the specified
`result-id`
gzip (bool): if `True`, non-empty tables will be compressed
with gzip
"""
from delphin.interfaces import ace
if generate and transfer:
raise ValueError("'generate' is incompatible with 'transfer'")
if source is None:
source = testsuite
if select is None:
select = 'result:mrs' if (generate or transfer) else 'item:i-input'
if generate:
processor = ace.AceGenerator
elif transfer:
processor = ace.AceTransferer
else:
if not all_items:
select += ' where i-wf != 2'
processor = ace.AceParser
if result_id is not None:
select += ' where result-id == {}'.format(result_id)
source = itsdb.TestSuite(source)
target = itsdb.TestSuite(testsuite)
column, tablename, condition = _interpret_selection(select, source)
table = itsdb.Table(
source[tablename].fields,
tsql.select(
'* from {} {}'.format(tablename, condition),
source,
cast=False))
with processor(grammar, cmdargs=options) as cpu:
target.process(cpu, ':' + column, source=table, gzip=gzip)
def _interpret_selection(select, source):
    """
    Split the query *select* into its column, table, and condition.

    Returns:
        tuple: `(column, tablename, condition)`, where *condition* is
        the text of *select* from its `where` keyword onward, or `''`
        if there is no `where` clause
    Raises:
        ValueError: if *select* does not project exactly one column
    """
    query = tsql.inspect_query('select ' + select)
    columns = query['projection']
    if columns == '*' or len(columns) != 1:
        raise ValueError("'select' must return a single column")

    tablename, _, column = columns[0].rpartition(':')
    if not tablename:
        tables = query['tables']
        if len(tables) == 1:
            # query could be 'i-input from item' instead of 'item:i-input'
            tablename = tables[0]
        else:
            # otherwise guess
            tablename = source.relations.find(column)[0]

    # keep everything from the first 'where' on, without the leading space
    _, keyword, rest = select.partition(' where ')
    condition = keyword[1:] + rest if keyword else ''
    return column, tablename, condition
###############################################################################
### REPP ######################################################################
def repp(file, config=None, module=None, active=None,
         format=None, trace_level=0):
    """
    Tokenize with a Regular Expression PreProcessor (REPP).

    Results are printed directly to stdout. If more programmatic
    access is desired, the :mod:`delphin.repp` module provides a
    similar interface.

    Args:
        file (str, file): filename, open file, or stream of sentence
            inputs
        config (str): path to a PET REPP configuration (.set) file
        module (str): path to a top-level REPP module; other modules
            are found by external group calls
        active (list): select which modules are active; if `None`, all
            are used; incompatible with *config* (default: `None`)
        format (str): the output format (`"yy"`, `"string"`, `"line"`,
            or `"triple"`; default: `"yy"`)
        trace_level (int): if `0` no trace info is printed; if `1`,
            applied rules are printed, if greater than `1`, both
            applied and unapplied rules (in order) are printed
            (default: `0`)
    Raises:
        ValueError: if *config* is combined with *module* or *active*
    """
    # validate before importing so bad arguments fail fast
    if config is not None and module is not None:
        raise ValueError("cannot specify both 'config' and 'module'")
    if config is not None and active:
        raise ValueError("'active' cannot be used with 'config'")

    from delphin.repp import REPP

    # honor the documented default; passing None through would make
    # _repp() print no tokenization at all
    if format is None:
        format = 'yy'

    if config:
        r = REPP.from_config(config)
    elif module:
        r = REPP.from_file(module, active=active)
    else:
        r = REPP()  # just tokenize

    if hasattr(file, 'read'):
        for line in file:
            _repp(r, line, format, trace_level)
    else:
        with io.open(file, encoding='utf-8') as fh:
            for line in fh:
                _repp(r, line, format, trace_level)
def _repp(r, line, format, trace_level):
if trace_level > 0:
for step in r.trace(line.rstrip('\n'), verbose=True):
if not hasattr(step, 'applied'):
print('Done:{}'.format(step.string))
continue
if step.applied == True or trace_level > 1:
print('{}:{!s}\n In:{}\n Out:{}'.format(
'Applied' if step.applied else 'Did not apply',
step.operation, step.input, step.output))
res = r.tokenize(line.rstrip('\n'))
if format == 'yy':
print(res)
elif format == 'string':
print(' '.join(t.form for t in res.tokens))
elif format == 'line':
for t in res.tokens:
print(t.form)
print()
elif format == 'triple':
for t in res.tokens:
if t.lnk.type == Lnk.CHARSPAN:
cfrom, cto = t.lnk.data
else:
cfrom, cto = -1, -1
print(
'({}, {}, {})'
.format(cfrom, cto, t.form)
)
print()
###############################################################################
### COMPARE ###################################################################
def compare(testsuite, gold, select='i-id i-input mrs'):
    """
    Compare two [incr tsdb()] profiles.

    Args:
        testsuite (str, TestSuite): path to the test [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        gold (str, TestSuite): path to the gold [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        select: TSQL query to select (id, input, mrs) triples
            (default: `i-id i-input mrs`)
    Yields:
        dict: Comparison results as::

            {"id": "item identifier",
             "input": "input sentence",
             "test": number_of_unique_results_in_test,
             "shared": number_of_shared_results,
             "gold": number_of_unique_results_in_gold}
    Raises:
        ValueError: if *select* does not return exactly three fields
    """
    from delphin.mrs import simplemrs, compare as mrs_compare

    def _as_testsuite(obj):
        # accept a TestSuite, an ItsdbProfile, or a path
        if isinstance(obj, itsdb.TestSuite):
            return obj
        if isinstance(obj, itsdb.ItsdbProfile):
            obj = obj.root
        return itsdb.TestSuite(obj)

    testsuite = _as_testsuite(testsuite)
    gold = _as_testsuite(gold)

    projection = tsql.inspect_query('select ' + select)['projection']
    if len(projection) != 3:
        raise ValueError('select does not return 3 fields: ' + select)

    # map each identifier to its input using the first two columns
    id_column, input_column = projection[0], projection[1]
    id_and_input = '{} {}'.format(id_column, input_column)
    inputs_by_id = dict(tsql.select(id_and_input, testsuite))

    # pair up test and gold rows on the first (identifier) column
    matched = itsdb.match_rows(
        tsql.select(select, testsuite),
        tsql.select(select, gold),
        0)

    for key, testrows, goldrows in matched:
        test_bag = [simplemrs.loads_one(row[2]) for row in testrows]
        gold_bag = [simplemrs.loads_one(row[2]) for row in goldrows]
        test_unique, shared, gold_unique = mrs_compare.compare_bags(
            test_bag, gold_bag)
        yield {'id': key,
               'input': inputs_by_id[key],
               'test': test_unique,
               'shared': shared,
               'gold': gold_unique}
###############################################################################
### HELPER FUNCTIONS ##########################################################
def _prepare_output_directory(path):
try:
os.makedirs(path) # exist_ok=True is available from Python 3.2
except OSError as ex: # PermissionError is available from Python 3.3
if ex.errno == 17 and os.path.isdir(path):
pass # existing directory; maybe it's usable
else:
raise