# Source code for delphin.interfaces.base

try:
    from collections.abc import Sequence
except ImportError:  # Python 2 has no collections.abc
    from collections import Sequence
from datetime import datetime

from delphin.derivation import Derivation
from delphin.tokens import YyTokenLattice
from delphin.mrs import (
    Mrs,
    Dmrs,
    simplemrs,
    eds,
)
from delphin.util import SExpr, stringtypes

class Processor(object):
    """
    Common interface shared by all PyDelphin processors.

    Concrete processors (for example
    :class:`~delphin.interfaces.ace.AceProcess`) derive from this class
    and implement :meth:`process_item`. The same interface can be
    implemented by a preprocessor that wraps another processor, which
    lets the wrapper be used wherever a processor is expected, e.g.,
    with :meth:`TestSuite.process() <delphin.itsdb.TestSuite.process>`.

    Attributes:
        task: name of the task the processor performs (e.g.
            `"parse"`, `"transfer"`, or `"generate"`); `None` on this
            base class
    """

    task = None

    def process_item(self, datum, keys=None):
        """
        Send *datum* to the processor and return the result.

        This is the generic entry point that subclasses implement
        around their processor-specific processing method. The
        intended contract is that, when *keys* is given, it is copied
        into the `keys` key of the response, and when the processor's
        `task` member is not `None`, it is copied into the `task` key
        of the response. This bookkeeping helps track items when many
        are processed at once and tells downstream functions what the
        process did.

        Args:
            datum: the item content to process
            keys: a mapping of item identifiers which will be copied
                into the response
        Raises:
            NotImplementedError: always, on this base class
        """
        raise NotImplementedError()
class ParseResult(dict):
    """
    A single result dictionary with on-demand deserialization.

    Each accessor method looks up one entry of the dictionary and, when
    the stored value is in a supported serialized form, converts it to
    the corresponding object. The instance remains an ordinary
    dictionary, so the raw data is always available via dict access.
    """

    def __repr__(self):
        return 'ParseResult({})'.format(dict.__repr__(self))

    def derivation(self):
        """
        Return the `derivation` entry as a Derivation object.

        A dict value is read with :meth:`Derivation.from_dict` and a
        string value with :meth:`Derivation.from_string`; any other
        value (including a missing entry, i.e. `None`) is returned
        unchanged.
        """
        data = self.get('derivation')
        if data is None:
            return None
        if isinstance(data, dict):
            return Derivation.from_dict(data)
        if isinstance(data, stringtypes):
            return Derivation.from_string(data)
        return data

    def tree(self):
        """
        Return the labeled syntax tree.

        A string `tree` entry is parsed as an S-expression. Any other
        non-`None` `tree` entry is returned unchanged. When there is no
        `tree` entry, the tree is built from the `derivation` entry if
        that is a dict with a `label` key; otherwise `None` is
        returned.
        """
        data = self.get('tree')
        if isinstance(data, stringtypes):
            return SExpr.parse(data).data
        if data is not None:
            return data

        root = self.get('derivation')
        if not (isinstance(root, dict) and 'label' in root):
            return None

        def _labeled(node):
            # [label, [form]] for nodes carrying tokens, otherwise
            # [label, subtree, subtree, ...] over the daughters
            branch = [node.get('label', '')]
            if 'tokens' in node:
                branch.append([node.get('form', '')])
            else:
                for child in node.get('daughters', []):
                    branch.append(_labeled(child))
            return branch

        return _labeled(root)

    def mrs(self):
        """
        Return the `mrs` entry as an Mrs object.

        A dict value is read with :meth:`Mrs.from_dict` and a string
        value with :func:`simplemrs.loads_one`; any other value
        (including a missing entry, i.e. `None`) is returned unchanged.
        """
        data = self.get('mrs')
        if data is None:
            return None
        if isinstance(data, dict):
            return Mrs.from_dict(data)
        if isinstance(data, stringtypes):
            return simplemrs.loads_one(data)
        return data

    def eds(self):
        """
        Return the `eds` entry as an Eds object.

        A dict value is read with :meth:`eds.Eds.from_dict` and a
        string value with :func:`eds.loads_one`; any other value
        (including a missing entry, i.e. `None`) is returned unchanged.
        """
        data = self.get('eds')
        if data is None:
            return None
        if isinstance(data, dict):
            return eds.Eds.from_dict(data)
        if isinstance(data, stringtypes):
            return eds.loads_one(data)
        return data

    def dmrs(self):
        """
        Return the `dmrs` entry as a Dmrs object.

        A dict value is read with :meth:`Dmrs.from_dict`; any other
        value (including a missing entry, i.e. `None`) is returned
        unchanged.
        """
        data = self.get('dmrs')
        if isinstance(data, dict):
            return Dmrs.from_dict(data)
        return data
class ParseResponse(dict):
    """
    A wrapper around the response dictionary for more convenient
    access to results.

    The instance is still a dictionary; the methods below only add
    typed views over its `results` and `tokens` entries.
    """

    # class used to wrap each raw result; subclasses may override it
    _result_factory = ParseResult

    def __repr__(self):
        return 'ParseResponse({})'.format(dict.__repr__(self))

    def results(self):
        """Return ParseResult objects for each result."""
        return [self._result_factory(r) for r in self.get('results', [])]

    def result(self, i):
        # raw docstring: the reST "\ " separator is not a valid Python
        # string escape and triggers a warning in a regular literal
        r"""
        Return a ParseResult object for the *i*\ th result.

        Raises:
            IndexError: if there is no result at index *i*
        """
        return self._result_factory(self.get('results', [])[i])

    def tokens(self, tokenset='internal'):
        """
        Deserialize and return a YyTokenLattice object for the
        initial or internal token set, if provided, from the YY
        format or the JSON-formatted data; otherwise return the
        original value.

        Args:
            tokenset (str): return `'initial'` or `'internal'` tokens
                (default: `'internal'`)
        Returns:
            :class:`YyTokenLattice` for string or sequence data; the
            stored value unchanged for any other type; `None` if the
            requested token set is not available
        """
        # a 'tokens' entry that is present but None (e.g. a JSON null)
        # is treated the same as a missing entry
        toks = (self.get('tokens') or {}).get(tokenset)
        if toks is None:
            return None
        if isinstance(toks, stringtypes):
            return YyTokenLattice.from_string(toks)
        if isinstance(toks, Sequence):
            return YyTokenLattice.from_list(toks)
        return toks
class FieldMapper(object):
    """
    A class for mapping responses to [incr tsdb()] fields.

    This class provides two methods for mapping responses to fields:

    * map() - takes a response and returns a list of (table, data)
        tuples for the data in the response, as well as
        aggregating any necessary information

    * cleanup() - returns any (table, data) tuples resulting from
        aggregated data over all runs, then clears this data

    In addition, the :attr:`affected_tables` attribute should list
    the names of tables that become invalidated by using this
    FieldMapper to process a profile. Generally this is the list of
    tables that :meth:`map` and :meth:`cleanup` create records for,
    but it may also include those that rely on the previous set
    (e.g., treebanking preferences, etc.).

    Alternative [incr tsdb()] schema can be handled by overriding
    these two methods and the __init__() method.

    Attributes:
        affected_tables: list of tables that are affected by the
            processing
    """

    def __init__(self):
        # the parse keys exclude some that are handled specially
        self._parse_keys = '''
            ninputs ntokens readings first total tcpu tgc treal words
            l-stasks p-ctasks p-ftasks p-etasks p-stasks
            aedges pedges raedges rpedges tedges eedges ledges sedges redges
            unifications copies conses symbols others gcs i-load a-load
            date error comment
        '''.split()
        self._result_keys = '''
            result-id time r-ctasks r-ftasks r-etasks r-stasks size
            r-aedges r-pedges derivation surface tree mrs
        '''.split()
        self._run_keys = '''
            run-comment platform protocol tsdb application environment
            grammar avms sorts templates lexicon lrules rules
            user host os start end items status
        '''.split()
        # state aggregated across map() calls and cleared by cleanup()
        self._parse_id = -1
        self._runs = {}
        self._last_run_id = -1

        self.affected_tables = '''
            run parse result rule output edge tree decision preference
            update fold score
        '''.split()

    @staticmethod
    def _close_run(run):
        """Stamp *run* with the current time as `end` if it has none."""
        if 'end' not in run:
            run['end'] = datetime.now()

    def map(self, response):
        """
        Process *response* and return a list of (table, rowdata) tuples.

        One `parse` row is produced for the response, followed by one
        `result` row per entry of its `results`. If the response has a
        `run` entry, that run is remembered so that :meth:`cleanup` can
        emit it later.

        Note that *response* itself is updated: missing `ninputs`,
        `ntokens`, and `readings` values are filled in when they can be
        computed. If *response* has a `tokens` entry, it must also
        provide a `tokens()` method (as :class:`ParseResponse` does).

        Args:
            response: the response mapping to convert
        Returns:
            a list of `(table, rowdata)` tuples
        """
        inserts = []

        parse = {}
        # custom remapping, cleanup, and filling in holes
        parse['i-id'] = response.get('keys', {}).get('i-id', -1)
        # parse-ids follow the i-id when it is available and larger,
        # otherwise they just count up
        self._parse_id = max(self._parse_id + 1, parse['i-id'])
        parse['parse-id'] = self._parse_id
        parse['run-id'] = response.get('run', {}).get('run-id', -1)
        if 'tokens' in response:
            parse['p-input'] = response['tokens'].get('initial')
            parse['p-tokens'] = response['tokens'].get('internal')
            if 'ninputs' not in response:
                toks = response.tokens('initial')
                if toks is not None:
                    response['ninputs'] = len(toks.tokens)
            if 'ntokens' not in response:
                toks = response.tokens('internal')
                if toks is not None:
                    response['ntokens'] = len(toks.tokens)
        if 'readings' not in response and 'results' in response:
            response['readings'] = len(response['results'])
        # basic mapping
        for key in self._parse_keys:
            if key in response:
                parse[key] = response[key]
        inserts.append(('parse', parse))

        for result in response.get('results', []):
            d = {'parse-id': self._parse_id}
            if 'flags' in result:
                d['flags'] = SExpr.format(result['flags'])
            for key in self._result_keys:
                if key in result:
                    d[key] = result[key]
            inserts.append(('result', d))

        if 'run' in response:
            run_id = response['run'].get('run-id', -1)
            # a new run-id while the previous run is still open means
            # the previous run was not closed properly; close it now
            if run_id not in self._runs and self._last_run_id in self._runs:
                self._close_run(self._runs[self._last_run_id])
            self._runs[run_id] = response['run']
            self._last_run_id = run_id

        return inserts

    def cleanup(self):
        """
        Return aggregated (table, rowdata) tuples and clear the state.

        One `run` row is produced for every run seen by :meth:`map`
        since the last cleanup, in order of run-id. The most recent run
        is given an `end` time if it lacks one. If no run was recorded,
        an empty list is returned.

        Returns:
            a list of `(table, rowdata)` tuples
        """
        inserts = []

        # there is no last run if map() never saw a 'run' entry
        last_run = self._runs.get(self._last_run_id)
        if last_run is not None:
            self._close_run(last_run)

        for run_id in sorted(self._runs):
            run = self._runs[run_id]
            d = {'run-id': run.get('run-id', -1)}
            for key in self._run_keys:
                if key in run:
                    d[key] = run[key]
            inserts.append(('run', d))

        # reset for next task
        self._parse_id = -1
        self._runs = {}
        self._last_run_id = -1

        return inserts