Source code for delphin.hierarchy


"""
Basic support for hierarchies.
"""

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.exceptions import PyDelphinException


[docs] class HierarchyError(PyDelphinException): """Raised for invalid operations on hierarchies."""
def _norm_id(id): """Default id normalizer does nothing.""" return id
[docs] class MultiHierarchy: """ A Multiply-inheriting Hierarchy. Hierarchies may be constructed when instantiating the class or via the :meth:`update` method using a dictionary mapping identifiers to parents. In both cases, the parents may be a string of whitespace-separated parent identifiers or a tuple of (possibly non-string) identifiers. Also, both methods may take a `data` parameter which accepts a mapping from identifiers to arbitrary data. Data for identifiers may be get and set individually with dictionary key-access. >>> h = MultiHierarchy('*top*', {'food': '*top*', ... 'utensil': '*top*'}) >>> th.update({'fruit': 'food', 'apple': 'fruit'}) >>> th['apple'] = 'info about apples' >>> th.update({'knife': 'utensil'}, ... data={'knife': 'info about knives'}) >>> th.update({'vegetable': 'food', 'tomato': 'fruit vegetable'}) In some ways the MultiHierarchy behaves like a dictionary, but it is not a subclass of :py:class:`dict` and does not implement all its methods. Also note that some methods ignore the top node, which make certain actions easier: >>> h = Hierarchy('*top*', {'a': '*top*', 'b': 'a', 'c': 'a'}) >>> len(h) 3 >>> list(h) ['a', 'b', 'c'] >>> Hierarchy('*top*', {id: h.parents(id) for id in h}) == h True But others do not ignore the top node, namely those where you can request it specifically: >>> '*top*' in h True >>> print(h['*top*']) None >>> h.children('*top*') {'a'} Args: top: the unique top identifier hierarchy: a mapping of node identifiers to parents (see description above concerning the possible parent values) data: a mapping of node identifiers to arbitrary data normalize_identifier: a unary function used to normalize identifiers (e.g., case normalization) Attributes: top: the hierarchy's top node identifier """ def __init__(self, top, hierarchy=None, data=None, normalize_identifier=None): if not normalize_identifier: self._norm = _norm_id elif not callable(normalize_identifier): raise TypeError(f'not callable: {normalize_identifier!r}') else: self._norm = normalize_identifier top = self._norm(top) self._top = top self._hier = {top: ()} self._loer = {top: set()} self._data = {} if hierarchy is not None: self.update(hierarchy, data) @property def top(self): return self._top def __eq__(self, other): if not isinstance(other, self.__class__): return NotImplemented return (self._top == other._top and self._hier == other._hier and self._data == other._data) def __getitem__(self, identifier): identifier = self._norm(identifier) data = None try: data = self._data[identifier] except KeyError: if identifier not in self: raise return data def __setitem__(self, identifier, data): identifier = self._norm(identifier) if identifier not in self: raise HierarchyError( f'cannot set data; not in hierarchy: {identifier}') self._data[identifier] = data def __iter__(self): return iter(identifier for identifier in self._hier if identifier != self._top) def __contains__(self, identifier): return self._norm(identifier) in self._hier def __len__(self): return len(self._hier) - 1 # ignore top
[docs] def items(self): """ Return the (identifier, data) pairs excluding the top node. """ value = self.__getitem__ return [(identifier, value(identifier)) for identifier in self]
[docs] def update(self, subhierarchy=None, data=None): """ Incorporate *subhierarchy* and *data* into the hierarchy. This method ensures that nodes are inserted in an order that does not result in an intermediate state being disconnected or cyclic, and raises an error if it cannot avoid such a state due to *subhierarchy* being invalid when inserted into the main hierarchy. Updates are atomic, so *subhierarchy* and *data* will not be partially applied if there is an error in the middle of the operation. Args: subhierarchy: mapping of node identifiers to parents data: mapping of node identifiers to data objects Raises: HierarchyError: when *subhierarchy* or *data* cannot be incorporated into the hierarchy Examples: >>> h = MultiHierarchy('*top*') >>> h.update({'a': '*top*'}) >>> h.update({'b': '*top*'}, data={'b': 5}) >>> h.update(data={'a': 3}) >>> h['b'] - h['a'] 2 """ subhierarchy, data = self.validate_update(subhierarchy, data) # modify these locally in case of errors hier = dict(self._hier) loer = dict(self._loer) while subhierarchy: eligible = _get_eligible(hier, subhierarchy) for identifier in eligible: parents = subhierarchy.pop(identifier) _validate_parentage(identifier, parents, hier) hier[identifier] = parents loer[identifier] = set() for parent in parents: loer[parent].add(identifier) # assign to self if all tests have passed self._hier = hier self._loer = loer self._data.update(data)
[docs] def parents(self, identifier): """Return the immediate parents of *identifier*.""" identifier = self._norm(identifier) return self._hier[identifier]
[docs] def children(self, identifier): """Return the immediate children of *identifier*.""" identifier = self._norm(identifier) return self._loer[identifier]
[docs] def ancestors(self, identifier): """Return the ancestors of *identifier*.""" identifier = self._norm(identifier) return _ancestors(identifier, self._hier)
[docs] def descendants(self, identifier): """Return the descendants of *identifier*.""" identifier = self._norm(identifier) xs = set() for child in self._loer[identifier]: xs.add(child) xs.update(self.descendants(child)) return xs
[docs] def subsumes(self, a, b): """ Return `True` if node *a* subsumes node *b*. A node is subsumed by the other if it is a descendant of the other node or if it is the other node. It is not a commutative operation, so `subsumes(a, b) != subsumes(b, a)`, except for the case where `a == b`. Args: a: a node identifier b: a node identifier Examples: >>> h = MultiHierarchy('*top*', {'a': '*top*', ... 'b': '*top*', ... 'c': 'b'}) >>> all(h.subsumes(h.top, x) for x in h) True >>> h.subsumes('a', h.top) False >>> h.subsumes('a', 'b') False >>> h.subsumes('b', 'c') True """ norm = self._norm a, b = norm(a), norm(b) return a == b or b in self.descendants(a)
[docs] def compatible(self, a, b): """ Return `True` if node *a* is compatible with node *b*. In a multiply-inheriting hierarchy, node compatibility means that two nodes share a common descendant. It is a commutative operation, so `compatible(a, b) == compatible(b, a)`. Note that in a singly-inheriting hierarchy, two nodes are never compatible by this metric. Args: a: a node identifier b: a node identifier Examples: >>> h = MultiHierarchy('*top*', {'a': '*top*', ... 'b': '*top*'}) >>> h.compatible('a', 'b') False >>> h.update({'c': 'a b'}) >>> h.compatible('a', 'b') True """ norm = self._norm a, b = norm(a), norm(b) a_lineage = self.descendants(a).union([a]) b_lineage = self.descendants(b).union([b]) return len(a_lineage.intersection(b_lineage)) > 0
[docs] def validate_update(self, subhierarchy, data): """ Check if the update can apply to the current hierarchy. This method returns (*subhierarchy*, *data*) with normalized identifiers if the update is valid, otherwise it will raise a :exc:`HierarchyError`. Raises: HierarchyError: when the update is invalid """ subhierarchy, data = _normalize_update(self._norm, subhierarchy, data) ids = set(self._hier).intersection(subhierarchy) if ids: raise HierarchyError( 'already in hierarchy: {}'.format(', '.join(ids))) ids = set(data).difference(set(self._hier).union(subhierarchy)) if ids: raise HierarchyError( 'cannot update data; not in hierarchy: {}' .format(', '.join(ids))) return subhierarchy, data
def _ancestors(id, hier): xs = set() for parent in hier[id]: xs.add(parent) xs.update(_ancestors(parent, hier)) return xs def _normalize_update(norm, subhierarchy, data): sub = {} if subhierarchy: for id, parents in subhierarchy.items(): if isinstance(parents, str): parents = parents.split() id = norm(id) parents = tuple(map(norm, parents)) sub[id] = parents dat = {} if data: dat = {norm(id): obj for id, obj in data.items()} return sub, dat def _get_eligible(hier, sub): eligible = [id for id, parents in sub.items() if all(parent in hier for parent in parents)] if not eligible: raise HierarchyError( 'disconnected or cyclic hierarchy; remaining: {}' .format(', '.join(sub))) return eligible def _validate_parentage(id, parents, hier): ancestors = set() for parent in parents: ancestors.update(_ancestors(parent, hier)) redundant = ancestors.intersection(parents) if redundant: raise HierarchyError( '{} has redundant parents: {}' .format(id, ', '.join(sorted(redundant)))) # single-parented hierarchy might be something like this: # class Hierarchy(MultiHierarchy): # def parent(self, identifier): # identifier = self._norm(identifier) # parents = self._hier[identifier] # if parents: # parents = parents[0] # return parents # def validate_update(self, subhierarchy, data): # """ # Check if the update can apply to the current hierarchy. # Raises: # HierarchyError: when the update is invalid # """ # subhierarchy, data = super().validate_update(subhierarchy, data) # for id, parents in subhierarchy.items(): # if len(parents) != 1: # raise HierarchyError( # '{} has more than one parent: {}' # .format(id, ', '.join(parents))) # return subhierarchy, data