# Source code for delphin.commands


"""
PyDelphin API counterparts to the `delphin` commands.
"""

import sys
from pathlib import Path
import tempfile
import importlib
import logging
import warnings

# Syntax highlighting is optional: it is only enabled when both the
# delphin.highlight module and Pygments can be imported.
try:
    from delphin import highlight as _delphin_hl
    from pygments import highlight as _highlight
    from pygments.formatters import Terminal256Formatter as _Formatter
except ImportError:
    # No highlighting available; _colorize() tests for this falsy value
    # and returns its input unchanged.
    simplemrs_highlight = None
else:
    # Build the lexer and formatter once so they are shared by all calls.
    _lexer = _delphin_hl.SimpleMRSLexer()
    _formatter = _Formatter(style=_delphin_hl.MRSStyle)

    def simplemrs_highlight(text):
        """Return *text* highlighted with the SimpleMRS lexer and style."""
        return _highlight(text, _lexer, _formatter)


from delphin import exceptions
from delphin import tsdb, itsdb, tsql
from delphin.lnk import Lnk
from delphin.semi import SemI, load as load_semi
from delphin import util
from delphin.exceptions import PyDelphinException, PyDelphinWarning
import delphin.codecs
# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401


# EXCEPTIONS ##################################################################

class CommandError(exceptions.PyDelphinException):
    """Raised when a command function is called in an invalid way."""
###############################################################################
# CONVERT #####################################################################

# Lookup table used by _get_codec(): keys are codec names and values are
# the full module names to import, as collected by
# util.namespace_modules() from the delphin.codecs package.
_CODECS = util.namespace_modules(delphin.codecs)
def convert(path, source_fmt, target_fmt, select='result.mrs',
            properties=True, lnk=True, color=False, indent=None,
            show_status=False, predicate_modifiers=False,
            semi=None):
    """
    Convert between various DELPH-IN Semantics representations.

    The *source_fmt* and *target_fmt* arguments are downcased and
    hyphens are removed to normalize the codec name.

    Items that cannot be converted or serialized because of a
    :exc:`PyDelphinException`, :exc:`KeyError`, or :exc:`IndexError`
    are logged and left out of the output; the remaining items are
    still converted.

    Note:

        For syntax highlighting, `delphin.highlight`_ must be
        installed, and it is only available for select target formats.

        .. _delphin.highlight: https://github.com/delph-in/delphin.highlight

    Args:
        path (str, file): filename, testsuite directory, open file, or
            stream of input representations; if `None`, input is read
            from stdin
        source_fmt (str): convert from this format
        target_fmt (str): convert to this format
        select (str): TSQL query for selecting data (ignored if *path*
            is not a testsuite directory; default: `"result.mrs"`)
        properties (bool): include morphosemantic properties if `True`
            (default: `True`)
        lnk (bool): include lnk surface alignments and surface strings
            if `True` (default: `True`)
        color (bool): apply syntax highlighting if `True` and
            *target_fmt* is `"simplemrs"` (default: `False`)
        indent (int, optional): specifies an explicit number of spaces
            for indentation
        show_status (bool): show disconnected EDS nodes (ignored if
            *target_fmt* is not `"eds"`; default: `False`)
        predicate_modifiers (bool): apply EDS predicate modification
            for certain kinds of patterns (ignored if *target_fmt* is
            not an EDS format; default: `False`)
        semi: a :class:`delphin.semi.SemI` object or path to a SEM-I
            (ignored if *target_fmt* is not `indexedmrs`)
    Returns:
        str: the converted representation
    Raises:
        CommandError: if a codec name is invalid, the conversion is
            not supported, or *select* does not name exactly one column
    """
    if path is None:
        path = sys.stdin

    # normalize codec names
    source_fmt = source_fmt.replace('-', '').lower()
    target_fmt = target_fmt.replace('-', '').lower()
    source_codec = _get_codec(source_fmt)
    target_codec = _get_codec(target_fmt)
    converter = _get_converter(source_codec, target_codec, predicate_modifiers)

    # booleans and None are passed through; anything else must be a number
    if indent is not True and indent is not False and indent is not None:
        indent = int(indent)

    if len(tsql.inspect_query('select ' + select)['projection']) != 1:
        raise CommandError(
            'Exactly 1 column must be given in selection query: '
            '(e.g., result.mrs)')

    if semi is not None and not isinstance(semi, SemI):
        # lets ignore the SEM-I warnings until questions regarding
        # valid SEM-Is are resolved
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            semi = load_semi(semi)

    # read
    kwargs = {}
    if source_fmt == 'indexedmrs' and semi is not None:
        kwargs['semi'] = semi
    if hasattr(path, 'read'):
        xs = list(source_codec.load(path, **kwargs))
    else:
        path = Path(path).expanduser()
        if path.is_dir():
            db = tsdb.Database(path)
            # only the first representation in each selected cell is used
            xs = [
                next(iter(source_codec.loads(r[0], **kwargs)), None)
                for r in tsql.select(select, db)
            ]
        else:
            xs = list(source_codec.load(path, **kwargs))

    # write
    kwargs = {}
    if indent:
        kwargs['indent'] = indent
    if target_fmt == 'eds':
        kwargs['show_status'] = show_status
    if target_fmt == 'indexedmrs' and semi is not None:
        kwargs['semi'] = semi
    kwargs['properties'] = properties
    kwargs['lnk'] = lnk

    # Manually dealing with headers, joiners, and footers is to
    # accommodate streaming output. Otherwise it is the same as
    # calling the following:
    #     target_codec.dumps(xs, **kwargs)
    header = getattr(target_codec, 'HEADER', '')
    joiner = getattr(target_codec, 'JOINER', ' ')
    footer = getattr(target_codec, 'FOOTER', '')
    if indent is not None:
        if header:
            header += '\n'
        joiner = joiner.strip() + '\n'
        if footer:
            footer = '\n' + footer

    parts = []
    for x in xs:
        try:
            # Convert (if source representation != target representation)
            # inside the try block so that a failed conversion is logged
            # and skipped just like a failed serialization.
            if converter:
                x = converter(x)
            s = target_codec.encode(x, **kwargs)
        except (PyDelphinException, KeyError, IndexError):
            logging.exception('could not convert representation')
        else:
            parts.append(s)

    output = header + joiner.join(parts) + footer

    # target_fmt has already had its hyphens removed above
    if color and target_fmt == 'simplemrs':
        output = _colorize(output)

    return output
def _get_codec(name):
    """Import and return the codec module registered as *name*."""
    if name in _CODECS:
        return importlib.import_module(_CODECS[name])
    raise CommandError('invalid codec: {}'.format(name))


def _get_converter(source_codec, target_codec, predicate_modifiers):
    """
    Return a function converting from the representation of
    *source_codec* to that of *target_codec*, or `None` if both codecs
    use the same representation.
    """
    source_rep = source_codec.CODEC_INFO['representation'].lower()
    target_rep = target_codec.CODEC_INFO['representation'].lower()
    pair = (source_rep, target_rep)

    # The following could be done dynamically by inspecting if the
    # target representation has a from_{src_rep} function, but that
    # seems like overkill, and it's not clear what to do about
    # EDS's predicate_modifiers argument in that case.
    if pair == ('mrs', 'dmrs'):
        from delphin.dmrs import from_mrs
        return from_mrs

    if pair == ('dmrs', 'mrs'):
        from delphin.mrs import from_dmrs
        return from_dmrs

    if pair == ('mrs', 'eds'):
        from delphin.eds import from_mrs as eds_from_mrs

        def mrs_to_eds(m):
            return eds_from_mrs(m, predicate_modifiers=predicate_modifiers)

        return mrs_to_eds

    if source_rep == target_rep:
        return None

    raise CommandError('{} -> {} conversion is not supported'.format(
        source_rep.upper(), target_rep.upper()))


def _colorize(text):
    """Return *text* with syntax highlighting applied when available."""
    return simplemrs_highlight(text) if simplemrs_highlight else text


###############################################################################
# SELECT ######################################################################
def select(query: str, path: util.PathLike, record_class=None):
    """
    Run a TSQL select query over an [incr tsdb()] test suite.

    The test suite at *path* is opened as a database with automatic
    casting of values enabled and the result of the query is returned.

    Args:
        query (str): TSQL select query (e.g., `'i-id i-input mrs'` or
            `'* from item where readings > 0'`)
        path: path to a TSDB test suite
        record_class: alternative class for records in the selection
    Yields:
        selected data from the test suite
    """
    database = tsdb.Database(path, autocast=True)
    selection = tsql.select(query, database, record_class=record_class)
    return selection
############################################################################### # MKPROF ######################################################################
def mkprof(destination, source=None, schema=None, where=None,
           delimiter=None, refresh=False, skeleton=False,
           full=False, gzip=False, quiet=False):
    """
    Create [incr tsdb()] profiles or skeletons.

    The data for the new testsuite is taken either from an existing
    testsuite or from lines of text. The data source is chosen as
    follows:

    - `source="testsuite/"` -- read data from `testsuite/`
    - `source=None, refresh=True` -- read data from *destination*
    - `source=None, refresh=False` -- read sentences from stdin
    - `source="sents.txt"` -- read sentences from `sents.txt`

    The two sentence-based patterns require the *schema* parameter.

    Args:
        destination (str): path of the new testsuite
        source (str): path to a source testsuite or a file containing
            sentences; if not given and *refresh* is `False`, sentences
            are read from stdin
        schema (str): path to a relations file to use for the created
            testsuite; if `None` and *source* is a test suite, the
            schema of *source* is used
        where (str): TSQL condition to filter records by; ignored if
            *source* is not a testsuite
        delimiter (str): if given, split lines from *source* or stdin
            on the character *delimiter*; if *delimiter* is `"@"`,
            split using :func:`delphin.tsdb.split`; a header line
            with field names is required; ignored when the data source
            is not text lines
        refresh (bool): if `True`, rewrite the data at *destination*;
            implies *full* is `True`; ignored if *source* is not
            `None`, best combined with *schema* or *gzip* (default:
            `False`)
        skeleton (bool): if `True`, only write tsdb-core files
            (default: `False`)
        full (bool): if `True`, copy all data from the source
            testsuite; ignored if the data source is not a testsuite
            or if *skeleton* is `True` (default: `False`)
        gzip (bool): if `True`, non-empty tables will be compressed
            with gzip
        quiet (bool): if `True`, don't print summary information
    """
    destination = Path(destination).expanduser()
    source = None if source is None else Path(source).expanduser()
    if schema is not None:
        schema = tsdb.read_schema(schema)

    # relations of the data source, passed on to the cleanup step
    stale_relations = []

    if source is None:
        if refresh:
            # rewrite the destination test suite in place
            db = tsdb.Database(destination)
            stale_relations = list(db.schema)
            tsdb.write_database(db, db.path, schema=schema, gzip=gzip)
        else:
            # sentences arrive on stdin
            _mkprof_from_lines(
                destination, sys.stdin, schema, delimiter, gzip)
    elif source.is_file():
        # sentences are read from a text file
        with source.open() as fh:
            _mkprof_from_lines(
                destination, fh, schema, delimiter, gzip)
    elif source.is_dir():
        # data is copied from an existing test suite
        db = tsdb.Database(source)
        stale_relations = list(db.schema)
        _mkprof_from_database(
            destination, db, schema, where, full, gzip)
    else:
        raise CommandError('invalid source for mkprof: {!r}'.format(source))

    _mkprof_cleanup(destination, skeleton, stale_relations)

    if not quiet:
        _mkprof_summarize(destination, tsdb.read_schema(destination))
def _mkprof_from_lines(destination, stream, schema, delimiter, gzip):
    """
    Create a testsuite at *destination* whose `item` relation is
    filled from the text lines of *stream*.

    Raises:
        CommandError: if no *schema* is given
    """
    if not schema:
        raise CommandError(
            'a schema is required to make a testsuite from text')

    lineiter = iter(stream)
    colnames, split = _make_split(delimiter, lineiter)

    # setup destination testsuite
    tsdb.initialize_database(destination, schema, files=True)

    tsdb.write(destination,
               'item',
               _lines_to_records(lineiter, colnames, split, schema['item']),
               fields=schema['item'],
               gzip=gzip)


def _lines_to_records(lineiter, colnames, split, fields):
    """
    Yield one record per line of *lineiter*.

    Each line is split with *split* and the values are paired with
    *colnames*. If *fields* defines `i-id` and the line does not
    provide it, the 1-based line count is used; if *fields* defines
    `i-length` and the line provides `i-input` but no `i-length`, the
    number of whitespace-separated tokens of `i-input` is used.

    Raises:
        CommandError: if a line does not have one value per column
            name or if an `i-id` value is repeated
    """
    with_i_id = with_i_length = False
    for field in fields:
        if field.name == 'i-id':
            with_i_id = True
        elif field.name == 'i-length':
            with_i_length = True

    i_ids = set()
    for i, line in enumerate(lineiter, 1):
        colvals = split(line.rstrip('\n'))
        if len(colvals) != len(colnames):
            # values are passed through str() because the split
            # function is not guaranteed to return only strings
            raise CommandError(
                'line values do not match expected fields:\n'
                ' fields: {}\n'
                ' values: {}'.format(', '.join(colnames),
                                     ', '.join(map(str, colvals))))
        colmap = dict(zip(colnames, colvals))

        if with_i_id:
            if 'i-id' not in colmap:
                colmap['i-id'] = i
            if colmap['i-id'] in i_ids:
                raise CommandError('duplicate i-id: {}'
                                   .format(colmap['i-id']))
            i_ids.add(colmap['i-id'])

        if with_i_length and 'i-length' not in colmap and 'i-input' in colmap:
            colmap['i-length'] = len(colmap['i-input'].split())

        yield tsdb.make_record(colmap, fields)


def _make_split(delimiter, lineiter):
    """
    Return the column names and line-splitting function for text input.

    Without a *delimiter*, each line is a sentence and the columns are
    `i-wf` and `i-input`; a leading `*` marks the sentence with an
    `i-wf` of 0 (and is removed), otherwise `i-wf` is 1. With a
    *delimiter*, the first line of *lineiter* is consumed as the header
    naming the columns.

    Raises:
        CommandError: if a *delimiter* is given but *lineiter* has no
            header line
    """
    if not delimiter:

        def split(line):
            return (0, line[1:]) if line.startswith('*') else (1, line)

        colnames = ('i-wf', 'i-input')

    else:
        if delimiter == '@':
            split = tsdb.split
        else:

            def split(line):
                return line.split(delimiter)

        try:
            header = next(lineiter)
        except StopIteration:
            raise CommandError(
                'a header line with field names is required '
                'when a delimiter is given') from None
        # Drop the line terminator, as is done for the data lines, so
        # it does not become part of the last column name.
        colnames = split(header.rstrip('\n'))

    return colnames, split


def _mkprof_from_database(destination, db, schema, where, full, gzip):
    """
    Write the relations of *schema* to *destination* using data from
    the database *db*.

    Only the tsdb-core relations are copied unless *full* is true;
    every other relation of *schema* is written without records.
    """
    if schema is None:
        schema = db.schema

    destination.mkdir(exist_ok=True)
    tsdb.write_schema(destination, schema)

    to_copy = set(schema if full else tsdb.TSDB_CORE_FILES)
    where = '' if where is None else 'where ' + where

    for table in schema:
        if table not in to_copy or _no_such_relation(db, table):
            records = []
        elif where:
            # filter the data, but use all if the query fails
            # (e.g., if the filter and table cannot be joined)
            try:
                records = tsql.select(
                    '* from {} {}'.format(table, where), db)
            except tsql.TSQLError:
                records = list(db[table])
        else:
            records = list(db[table])
        tsdb.write(destination,
                   table,
                   records,
                   schema[table],
                   gzip=gzip)


def _no_such_relation(db, name):
    """
    Return True if the relation *name* is not defined in *db* or
    does not exist, otherwise False.
    """
    if name not in db:
        return True
    try:
        tsdb.get_path(db.path, name)
    except tsdb.TSDBError:
        return True
    return False


def _mkprof_cleanup(destination, skeleton, old_files):
    """
    Delete relation files from *destination* that should not be kept.

    Files of relations in *old_files* that are not in the schema at
    *destination* are deleted. If *skeleton* is true, files of
    non-core relations and empty files are deleted as well. Both the
    plain and the `.gz` form of each file are considered.
    """
    schema = tsdb.read_schema(destination)
    to_keep = set(schema)
    if skeleton:
        to_keep = to_keep.intersection(tsdb.TSDB_CORE_FILES)

    for name in set(schema).union(old_files):
        tx_path = destination.joinpath(name).with_suffix('')
        gz_path = destination.joinpath(name).with_suffix('.gz')

        if (tx_path.is_file()
            and (name not in to_keep
                 or (skeleton and tx_path.stat().st_size == 0))):
            tx_path.unlink()

        if (gz_path.is_file()
            and (name not in to_keep
                 or (skeleton and gz_path.stat().st_size == 0))):
            gz_path.unlink()


def _mkprof_summarize(destination, schema):
    """
    Print the size in bytes of the relations file and of each file of
    *schema* found at *destination*.
    """
    # summarize what was done
    isatty = sys.stdout.isatty()

    def _red(s):
        # only use terminal color codes when stdout is a terminal
        return '\x1b[1;31m{}\x1b[0m'.format(s) if isatty else s

    fmt = '{:>8} bytes\t{}'
    for filename in ['relations'] + list(schema):
        path = destination.joinpath(filename)
        if path.is_file():
            stat = path.stat()
            print(fmt.format(stat.st_size, filename))
        elif path.with_suffix('.gz').is_file():
            stat = path.with_suffix('.gz').stat()
            print(fmt.format(stat.st_size, _red(filename + '.gz')))


###############################################################################
# PROCESS #####################################################################
def process(grammar, testsuite, source=None, select=None,
            generate=False, transfer=False, full_forest=False,
            options=None, all_items=False, result_id=None, gzip=False,
            stderr=None):
    """
    Process (e.g., parse) a [incr tsdb()] profile.

    Results are written directly to *testsuite*.

    If *select* is `None`, the defaults depend on the task:

        ==========  =========================
        Task        Default value of *select*
        ==========  =========================
        Parsing     `item.i-input`
        Transfer    `result.mrs`
        Generation  `result.mrs`
        ==========  =========================

    The filters implied by *all_items* and *result_id* are combined
    with `and` with any `where` condition already present in *select*.

    Args:
        grammar (str): path to a compiled grammar image
        testsuite (str): path to a [incr tsdb()] testsuite where data
            will be read from (see *source*) and written to
        source (str): path to a [incr tsdb()] testsuite; if `None`,
            *testsuite* is used as the source of data
        select (str): TSQL query for selecting processor inputs
            (default depends on the processor type)
        generate (bool): if `True`, generate instead of parse
            (default: `False`)
        transfer (bool): if `True`, transfer instead of parse
            (default: `False`)
        full_forest (bool): if `True`, pass `full_forest=True` to the
            parser; only used when parsing (default: `False`)
        options (list): list of ACE command-line options to use when
            invoking the ACE subprocess; unsupported options will
            give an error message
        all_items (bool): if `True`, don't exclude ignored items
            (those with `i-wf==2`) when parsing
        result_id (int): if given, only keep items with the specified
            `result-id`
        gzip (bool): if `True`, non-empty tables will be compressed
            with gzip
        stderr (file): stream for ACE's stderr
    Raises:
        CommandError: if more than one of *generate*, *transfer*, and
            *full_forest* is set
    """
    from delphin import ace

    grammar = Path(grammar).expanduser()
    testsuite = Path(testsuite).expanduser()

    kwargs = {}
    kwargs['stderr'] = stderr
    if sum(1 if mode else 0 for mode in (generate, transfer, full_forest)) > 1:
        raise CommandError("'generate', 'transfer', and 'full-forest' "
                           "are mutually exclusive")

    if source is None:
        source = testsuite
    if select is None:
        select = 'result.mrs' if (generate or transfer) else 'item.i-input'
    if generate:
        processor = ace.ACEGenerator
    elif transfer:
        processor = ace.ACETransferer
    else:
        if full_forest:
            kwargs['full_forest'] = True
        if not all_items:
            select = _add_where_condition(select, 'i-wf != 2')
        processor = ace.ACEParser
    if result_id is not None:
        select = _add_where_condition(
            select, 'result-id == {}'.format(result_id))

    target = itsdb.TestSuite(testsuite)
    column, tablename, condition = _interpret_selection(select, source)

    with tempfile.TemporaryDirectory() as tmpdir:
        # use a temporary test suite directory for filtered inputs
        mkprof(tmpdir, source=source, where=condition,
               full=True, gzip=True, quiet=True)
        tmp = itsdb.TestSuite(tmpdir)

        with processor(grammar, cmdargs=options, **kwargs) as cpu:
            target.process(cpu,
                           selector=(tablename, column),
                           source=tmp,
                           gzip=gzip)


def _add_where_condition(select, condition):
    """
    Return the query *select* further restricted by *condition*.

    A query without a `where` clause gets one; otherwise the existing
    condition is parenthesized and joined to *condition* with `and`,
    so that a query never ends up with two `where` clauses.
    """
    head, sep, existing = select.partition(' where ')
    if not sep:
        return '{} where {}'.format(select, condition)
    return '{} where ({}) and {}'.format(head, existing, condition)
def _interpret_selection(select, source):
    """
    Break the TSQL selection *select* into its parts.

    Args:
        select (str): TSQL query (without the `select` keyword) that
            projects a single column
        source: path of the testsuite whose schema is consulted when
            the relation of the column is not given by the query
    Returns:
        tuple: `(column, tablename, condition)` where *condition* is
        the text after the first `' where '` in *select*, or `''` if
        there is none
    Raises:
        CommandError: if *select* does not project exactly one column
            or if no relation defining the column can be found
    """
    queryobj = tsql.inspect_query('select ' + select)
    projection = queryobj['projection']
    if projection == '*' or len(projection) != 1:
        raise CommandError("'select' must return a single column")
    tablename, _, column = projection[0].rpartition('.')
    if not tablename:
        # query could be 'i-input from item' instead of 'item.i-input'
        if len(queryobj['relations']) == 1:
            tablename = queryobj['relations'][0]
        # otherwise guess
        else:
            schema = tsdb.read_schema(source)
            # a default is given to next() so that a column that is
            # not in the schema is reported as a CommandError rather
            # than as a bare StopIteration
            tablename = next(
                (table for table in schema
                 if any(f.name == column for f in schema[table])),
                None)
            if tablename is None:
                raise CommandError(
                    'no relation defines the selected column: {}'
                    .format(column))
    try:
        condition = select[select.index(' where ') + 7:]
    except ValueError:
        condition = ''
    return column, tablename, condition


###############################################################################
# REPP ########################################################################
def repp(source, config=None, module=None, active=None,
         format=None, trace_level=0):
    """
    Tokenize with a Regular Expression PreProcessor (REPP).

    Results are printed directly to stdout. If more programmatic
    access is desired, the :mod:`delphin.repp` module provides a
    similar interface.

    Args:
        source (str, file): filename, open file, or stream of sentence
            inputs
        config (str): path to a PET REPP configuration (.set) file
        module (str): path to a top-level REPP module; other modules
            are found by external group calls
        active (list): select which modules are active; if `None`, all
            are used; incompatible with *config* (default: `None`)
        format (str): the output format (`"yy"`, `"string"`, `"line"`,
            or `"triple"`; default: `"yy"`)
        trace_level (int): if `0` no trace info is printed; if `1`,
            applied rules are printed, if greater than `1`, both
            applied and unapplied rules (in order) are printed
            (default: `0`)
    Raises:
        CommandError: if both *config* and *module* are given, or if
            *active* is used together with *config*
    """
    from delphin.repp import REPP

    if config is not None and module is not None:
        raise CommandError("cannot specify both 'config' and 'module'")
    if config is not None and active:
        raise CommandError("'active' cannot be used with 'config'")

    # Apply the documented default; without this an unspecified format
    # matches none of the output branches and nothing is printed.
    if format is None:
        format = 'yy'

    if config:
        r = REPP.from_config(config)
    elif module:
        r = REPP.from_file(module, active=active)
    else:
        r = REPP()  # just tokenize

    if hasattr(source, 'read'):
        for line in source:
            _repp(r, line, format, trace_level)
    else:
        source = Path(source).expanduser()
        with source.open(encoding='utf-8') as fh:
            for line in fh:
                _repp(r, line, format, trace_level)
def _repp(r, line, format, trace_level):
    """
    Tokenize one *line* with the REPP *r* and print the result in the
    given *format*, preceded by trace information if *trace_level* is
    greater than 0.
    """
    if trace_level > 0:
        for step in r.trace(line.rstrip('\n'), verbose=True):
            if hasattr(step, 'applied'):
                # unapplied rules are only shown at higher trace levels
                if step.applied or trace_level > 1:
                    status = 'Applied' if step.applied else 'Did not apply'
                    print('{}:{!s}\n In:{}\n Out:{}'.format(
                        status, step.operation, step.input, step.output))
            else:
                # a step without an 'applied' attribute is reported as
                # the finished string
                print('Done:{}'.format(step.string))

    result = r.tokenize(line.rstrip('\n'))

    if format == 'yy':
        print(result)
    elif format == 'string':
        forms = [token.form for token in result.tokens]
        print(' '.join(forms))
    elif format == 'line':
        for token in result.tokens:
            print(token.form)
        print()
    elif format == 'triple':
        for token in result.tokens:
            # tokens without a character span are printed with -1 offsets
            span = (token.lnk.data
                    if token.lnk.type == Lnk.CHARSPAN
                    else (-1, -1))
            cfrom, cto = span
            print('({}, {}, {})'.format(cfrom, cto, token.form))
        print()


###############################################################################
# COMPARE #####################################################################
def compare(testsuite, gold, select='i-id i-input mrs'):
    """
    Compare the MRSs of two [incr tsdb()] profiles item by item.

    Args:
        testsuite (str, TestSuite): path to the test [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        gold (str, TestSuite): path to the gold [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        select: TSQL query to select (id, input, mrs) triples
            (default: `i-id i-input mrs`)
    Yields:
        dict: Comparison results as::

            {"id": "item identifier",
             "input": "input sentence",
             "test": number_of_unique_results_in_test,
             "shared": number_of_shared_results,
             "gold": number_of_unique_results_in_gold}

    """
    from delphin import mrs
    from delphin.codecs import simplemrs

    # accept either paths or already-opened test suites
    if not isinstance(testsuite, itsdb.TestSuite):
        testsuite = itsdb.TestSuite(Path(testsuite).expanduser())
    if not isinstance(gold, itsdb.TestSuite):
        gold = itsdb.TestSuite(Path(gold).expanduser())

    projection = tsql.inspect_query('select ' + select)['projection']
    if len(projection) != 3:
        raise CommandError('select does not return 3 fields: ' + select)

    # map each identifier to its input using the first two columns
    id_and_input = '{} {}'.format(projection[0], projection[1])
    inputs_by_id = dict(tsql.select(id_and_input, testsuite))

    # pair up the rows of both profiles on the first column
    test_selection = tsql.select(select, testsuite)
    gold_selection = tsql.select(select, gold)
    matched_rows = itsdb.match_rows(test_selection, gold_selection, 0)

    for key, testrows, goldrows in matched_rows:
        test_mrss = [simplemrs.decode(row[2]) for row in testrows]
        gold_mrss = [simplemrs.decode(row[2]) for row in goldrows]
        test_unique, shared, gold_unique = mrs.compare_bags(
            test_mrss, gold_mrss)
        yield {
            'id': key,
            'input': inputs_by_id.get(key),
            'test': test_unique,
            'shared': shared,
            'gold': gold_unique,
        }