Source code for delphin.mrs.compare


from collections import defaultdict
from itertools import permutations, groupby, chain
from functools import partial

import networkx as nx

from delphin.mrs.components import var_id, var_sort
from delphin.mrs.config import CONSTARG_ROLE, IVARG_ROLE

# NOTES:
#  the somewhat naive implementations in _node_isomorphic and
#  _var_isomorphic have terrible runtimes for the pathological case,
#  and I didn't finish the Turbo_ISO implementation before I had to
#  move on, so I'm conceding defeat for the time being and using
#  networkx just for isomorphism (so I build a nx DiGraph here, then
#  throw it away when i'm done).

[docs]def isomorphic(q, g, check_varprops=True): """ Return `True` if Xmrs objects *q* and *g* are isomorphic. Isomorphicity compares the predicates of an Xmrs, the variable properties of their predications (if `check_varprops=True`), constant arguments, and the argument structure between predications. Node IDs and Lnk values are ignored. Args: q: the left Xmrs to compare g: the right Xmrs to compare check_varprops: if `True`, make sure variable properties are equal for mapped predications """ qdg = _make_digraph(q, check_varprops) gdg = _make_digraph(g, check_varprops) def nem(qd, gd): # node-edge-match return qd.get('sig') == gd.get('sig') return nx.is_isomorphic(qdg, gdg, node_match=nem, edge_match=nem)
def _make_digraph(x, check_varprops): dg = nx.DiGraph() for ep in x.eps(): nid, pred, args = ep[0], ep[1], ep[3] if CONSTARG_ROLE in args: s = '{}({})'.format(pred.short_form(), args[CONSTARG_ROLE]) else: s = pred.short_form() dg.add_node(nid, sig=s) dg.add_edges_from((nid, var_id(val)) for role, val in args.items() if role != CONSTARG_ROLE) for var, vd in x._vars.items(): aspects = [] if check_varprops: aspects.extend('%s=%s' % (p,v) for p,v in vd['props']) aspects.extend('%s:%s' % (dg.node[tgt]['sig'], ref) for ref, tgts in vd['refs'].items() for tgt in tgts if tgt in x._eps) s = '{}|{}'.format(var_sort(var), '|'.join(sorted(aspects))) dg.add_node(var_id(var), sig=s) dg.add_edges_from((var_id(hi), var_id(lo), {'sig':reln}) for hi, reln, lo in x.hcons()) return dg def _turbo_isomorphic(q, g, check_varprops=True): """ Query Xmrs q is isomorphic to given Xmrs g if there exists an isomorphism (bijection of eps and vars) from q to g. """ # first some quick checks if len(q.eps()) != len(g.eps()): return False if len(q.variables()) != len(g.variables()): return False #if len(a.hcons()) != len(b.hcons()): return False try: next(_isomorphisms(q, g, check_varprops=check_varprops)) return True except StopIteration: return False def _isomorphisms(q, g, check_varprops=True): """ Inspired by Turbo_ISO: http://dl.acm.org/citation.cfm?id=2465300 """ # convert MRSs to be more graph-like, and add some indices qig = _IsoGraph(q, varprops=check_varprops) # qig = q isograph gig = _IsoGraph(g, varprops=check_varprops) # gig = q isograph # qsigs, qsigidx = _isomorphism_sigs(q, check_varprops) # gsigs, gsigidx = _isomorphism_sigs(g, check_varprops) # (it would be nice to not have to do this... maybe later) # qadj = _isomorphism_adj(q, qsigidx) # gadj = _isomorphism_adj(g, gsigidx) # the degree of each node is useful (but can it be combined with adj?) # qdeg = _isomorphism_deg(qadj) # gdeg = _isomorphism_deg(gadj) u_s = _isomorphism_choose_start_q_vertex(qig, gig, subgraph=False) q_ = _isomorphism_rewrite_to_NECtree(u_s, qig) for v_s in gsigs.get(qsigidx[u_s], []): cr = _isomorphism_explore_CR(q_, {v_s}, qig, gig) if cr is None: continue order = _isomorphism_determine_matching_order(q_, cr) update_state(M,F,{u_s}, {v_s}) subraph_search(q, q_, g, order, 1) # 1="the first query vertex to match" restore_state(M, F, {u_s}, {v_s}) class _IsoGraph(object): def __init__(self, x, varprops=True): self.sig = sig = defaultdict(list) self.sigidx = sigidx = {} self.adj = adj = defaultdict(lambda: defaultdict(list)) self.deg = deg = {} # generate signatures (node labels) for ep in x.eps(): # ep_sig = 'pred|{CARG}' nid, args = ep[0], ep[3] if CONSTARG_ROLE in args: s = '{}({})'.format(ep[1].string, args[CONSTARG_ROLE]) else: s = ep[1].string sig[s].append(nid) sigidx[nid] = s for var, vd in x._vars.items(): # var_sig = 'varsort|{VARPROPS_AND_REFERENCES}' aspects = ['%s=%s' % (p,v) for p,v in vd['props']] if varprops else [] aspects.extend(['%s:%s' % (sigidx[tgt], ref) for ref, tgts in vd['refs'].items() for tgt in tgts if tgt in x._eps]) s = '{}|{}'.format(var_sort(var), '|'.join(sorted(aspects))) sig[s].append(var) sigidx[var] = s # generate adjacency dicts (slightly optimized) for ep in x.eps(): nid, lbl, args, s = ep[0], ep[2], ep[3], sigidx[nid] adj[nid][sigidx[lbl]].append((lbl, 'LBL', 1)) adj[lbl][s].append((nid, 'LBL', 1)) for role, val in args.items(): if role == CONSTARG_ROLE: continue adj[nid][sigidx[val]].append((val, role, 1)) adj[val][s].append((nid, role, 1 if role == IVARG_ROLE else 0)) for hc in x._hcons.values(): hi, reln, lo = hc adj[hi][sigidx[lo]].append((lo, reln, 1)) adj[lo][sigidx[hi]].append((hi, reln, 0)) # generate a degree mapping for each node for id_, ad in adj.items(): # ad = adj dict mapping sig to adjacents # al = adjacent list; d is direction (forward=1, backward=0) deg[id_] = sum(d for al in ad.values() for _, _, d in al) def _isomorphism_choose_start_q_vertex(qig, gig, subgraph): k = 3 # max candidate start nodes; according to Turbo_ISO paper qsigidx, qadj, qdeg = q.sigidx, q.adj, q.deg gsig, gadj, gdeg = g.sig, g.adj, g.deg # rank query vertices by freq(g, L(u)) / deg(u) rank = {u: float(len(gsig[s]))/qdeg[u] for u, s in qsigidx.items()} q_s, q_s_num_candidates = None, 0 for u, rank in sorted(rank.items(), key=lambda x: x[1])[:k]: usig, udeg = qsigidx[u], qdeg[u] ulf = [(sig_, len(adjs)) for sig_, adjs in qadj.get(u, {}).items()] # cdf is the candidate degree filter # nlf is the neighborhood label (signature) frequency filter if subgraph: cdf = lambda v: udeg <= gdeg.get(v, 0) nlf = lambda v: all(n <= len(gadj[v].get(s, [])) for s, n in ulf) vs = list(filter(nlf, filter(cdf, gsig.get(usig, [])))) else: cdf = lambda v: udeg == gdeg.get(v, 0) nlf = lambda v: all(n == len(gadj[v].get(s, [])) for s, n in ulf) vs = list(filter(nlf, filter(cdf, gsig.get(usig, [])))) # find q_s with minimum number of candidate regions if q_s is None or len(vs) < q_s_num_candidates: q_s = u q_s_num_candidates = len(vs) return q_s def _isomorphism_rewrite_to_NECtree(q_s, qgraph): """ Neighborhood Equivalence Class tree (see Turbo_ISO paper) """ qadj = qgraph.adj adjsets = lambda x: set(chain.from_iterable(qadj[x].values())) t = ([q_s], []) # (NEC_set, children) visited = {q_s} vcur, vnext = [t], [] while vcur: for (nec, children) in vcur: c = defaultdict(list) for u in nec: for sig, adjlist in qadj[u].items(): c[sig].extend(x for x, _, _ in adjlist if x not in visited) for sig, c_adjlist in c.items(): visited.update(c_adjlist) # these are already grouped by label; now group by adjacents for key, grp in groupby(c_adjlist, key=adjsets): grp = list(grp) if len(grp) > 1: children.append((list(grp), [])) else: # NOTE: the paper says to look for mergeable things, # but I don't know what else to merge by. children.append((list(grp), [])) vnext.extend(children) vcur, vnext = vnext, [] return t def _isomorphism_explore_CR(q_, v_m, qig, gig): nec, children = q_ gadj, gdeg = gig.adj, gig.deg u = q_[0][0] # q_[0][0] is q_.NEC[1] (1-based index) ulf = [(s, len(adjs)) for s, adjs in qig.adj.get(u, {}).items()] nlf = lambda v: all(n <= len(gadj[v].get(s, [])) for s, n in ulf) for v in v_m: if q_deg > gig.deg[v] or not nlf(v): continue def _node_isomorphic(a, b, check_varprops=True): """ Two Xmrs objects are isomorphic if they have the same structure as determined by variable linkages between preds. """ # first some quick checks a_var_refs = sorted(len(vd['refs']) for vd in a._vars.values()) b_var_refs = sorted(len(vd['refs']) for vd in b._vars.values()) if a_var_refs != b_var_refs: return False print() # these signature: [node] indices are meant to avoid unnecessary # comparisons; they also take care of "semantic feasibility" # constraints (comparing node values and properties). All that's # left is the "syntactic feasibility", or node-edge shapes. # nodedicts are {sig: [(id, edges), ...], ...} a_nd = _node_isomorphic_build_nodedict(a, check_varprops) #print('a', a_nd) b_nd = _node_isomorphic_build_nodedict(b, check_varprops) #print('b', b_nd) #return a_sigs = {} # for node -> sig mapping # don't recurse when things are unique agenda = [] isomap = {} for sig, a_pairs in sorted(a_nd.items(), key=lambda x: len(x[1])): b_pairs = b_nd.get(sig, []) if len(a_pairs) != len(b_pairs): return False if len(a_pairs) == 1: a_, a_edges = a_pairs[0] b_, b_edges = b_pairs[0] if len(a_edges) != len(b_edges): return False a_sigs[a_] = sig isomap[a_] = b_ for edge, a_tgt in a_edges.items(): if edge not in b_edges: return False isomap[a_tgt] = b_edges[edge] else: for a_, ed in a_pairs: a_sigs[a_] = sig agenda.append((a_, sig, ed)) #print(agenda) #return isomaps = _node_isomorphic(agenda, a_sigs, b_nd, isomap, {}) # for sig, a_candidates in sorted(a_nodes.items(), key=lambda x: len(x[1])): # b_candidates = b_nodes.get(sig, []) # if len(a_candidates) != len(b_candidates): return False # candidates.append((a_candidates, b_candidates)) # # nodemaps = _isomorphic(a, b, candidates, {}) try: next(isomaps) return True except StopIteration: return False def _node_isomorphic_build_nodedict(x, varprops): # nodes are eps and vars: # preds: {ep_sig: (nid, {arg_or_lbl: tgt})} # ep_sig = 'pred|{CARG}' # vars: {var_sig: (var, {ref_name: tgt})} # var_sig = 'varsort|{VARPROPS}' nd = defaultdict(list) for ep in x.eps(): ep_sig = '{}|{}'.format(ep[1].string, ep[3].get(CONSTARG_ROLE)) ep_out = {'LBL': ep[2]} for rargname, val in ep[3].items(): if rargname == CONSTARG_ROLE: continue ep_out[rargname] = val nd[ep_sig].append((ep[0], ep_out)) for var, vd in x._vars.items(): if varprops: vps = '|'.join('%s=%s' % (p, v) for p, v in vd['props']) else: vps = '' refs = '|'.join( '%s:%s' % (x._eps[tgt][1].string, ref) for ref, tgts in vd['refs'].items() for tgt in tgts if tgt in x._eps ) var_sig = '{}|{}|{}'.format(var_sort(var), vps, refs) var_tgts = {} if var in x._hcons: hc = x._hcons[var] var_tgts[hc[1]] = hc[2] nd[var_sig].append((var, var_tgts)) return nd def _node_isomorphic(agenda, a_sigs, b_nd, isomap, failed): agendum = None for i, agendum in enumerate(agenda): # agendum is (node, signature, edgedict) if agendum[0] not in isomap or agendum[2]: break else: agendum = None if agendum is None: # nothing left to do; success yield isomap return a, sig, a_edges = agendum agenda = agenda[i+1:] # remove all skipped agenda # print('agenda', agenda) # print('isomap', isomap) # print('failed', failed) # print('a_sigs', a_sigs) # print('\t', a) b_mapped = set(isomap.values()) bs = [b for b in b_nd[sig] if b[0] not in b_mapped] if any(failed.get((a, b[0])) for b in bs): return # print('bs', bs) for b, b_edges in bs: if len(a_edges) != len(b_edges): failed[(a, b)] = True; continue newmaps = [(a, b)] for edge, a_ in a_edges.items(): if a_ in isomap: continue a_sig = a_sigs[a_] b_avail = set(b_data[0] for b_data in b_nd[a_sig]) b_ = b_edges.get(edge) if b_ is None or b_ not in b_avail: failed[(a, b)] = True; newmaps = None; break newmaps.append((a_, b_)) if newmaps: iso = isomap.copy() iso.update(newmaps) for iso_ in _isomorphic(agenda, a_sigs, b_nd, iso, failed): yield iso_ def _var_isomorphic(a, b, check_varprops=True): """ Two Xmrs objects are isomorphic if they have the same structure as determined by variable linkages between preds. """ # first some quick checks if len(a.eps()) != len(b.eps()): return False if len(a.variables()) != len(b.variables()): return False #if len(a.hcons()) != len(b.hcons()): return False # pre-populate varmap; first top variables varmap = {} for pair in [(a.top, b.top), (a.index, b.index), (a.xarg, b.xarg)]: if pair != (None, None): v1, v2 = pair if None in pair: return False if check_varprops and a.properties(v1) != b.properties(v2): return False varmap[v1] = v2 # find permutations of variables, grouped by those that share the # same signature. a_sigs = defaultdict(list) for var, vd in a._vars.items(): if var not in varmap: var_sig = _isomorphic_var_signature(vd, a, check_varprops) a_sigs[var_sig].append(var) b_sigs = defaultdict(list) tgtmapped = set(varmap.values()) for var, vd in b._vars.items(): if var not in tgtmapped: var_sig = _isomorphic_var_signature(vd, b, check_varprops) b_sigs[var_sig].append(var) candidates = [] for sig, a_vars in sorted(a_sigs.items(), key=lambda x: len(x[1])): b_vars = b_sigs.get(sig, []) if len(a_vars) != len(b_vars): return False print(sig, a_vars, b_vars) candidates.append((a_vars, b_vars)) varmaps = _var_isomorphic(a, b, candidates, varmap) # double check HCONS (it's hard to do with var signatures) for vm in varmaps: if all(vm[lo] == b._hcons.get(vm[hi], (None, None, None))[2] for hi, _, lo in a.hcons()): return True return False def _var_isomorphic(a, b, candidates, varmap): if not candidates: yield varmap return a_vars, b_vars = candidates[0] candidates = candidates[1:] # don't pop() because it changes the original perms = [list(zip(a_perm, b_vars)) for a_perm in permutations(a_vars)] for perm in perms: vm = varmap.copy() mv = dict((v, k) for k, v in varmap.items()) # mv is vm reversed for a_var, b_var in perm: if vm.get(a_var) != mv.get(b_var): vm = None; break else: vm[a_var] = b_var if not vm: continue for vm_ in _var_isomorphic(a, b, candidates, vm): yield vm_ def _isomorphic_var_signature(vd, xmrs, check_varprops): sig = [] if check_varprops: props = vd['props'] sig.extend('%s=%s' % (k, v) for k, v in props) if 'hcons' in vd: sig.append('%s>' % vd['hcons'][1]) if 'icons' in vd: sig.append('%s>' % vd['icons'][1]) for role, refval in vd['refs'].items(): if role in ('hcons', 'icons'): for c in refval: sig.append('<%s' % c[1]) # *cons relation only else: for nid in refval: pred = xmrs.pred(nid) sig.append('%s:%s' % (pred.short_form(), role)) return ' '.join(sorted(sig))
[docs]def compare_bags(testbag, goldbag, count_only=True): """ Compare two bags of Xmrs objects, returning a triple of (unique in test, shared, unique in gold). Args: testbag: An iterable of Xmrs objects to test. goldbag: An iterable of Xmrs objects to compare against. count_only: If True, the returned triple will only have the counts of each; if False, a list of Xmrs objects will be returned for each (using the ones from testbag for the shared set) Returns: A triple of (unique in test, shared, unique in gold), where each of the three items is an integer count if the count_only parameter is True, or a list of Xmrs objects otherwise. """ gold_remaining = list(goldbag) test_unique = [] shared = [] for test in testbag: gold_match = None for gold in gold_remaining: if isomorphic(test, gold): gold_match = gold break if gold_match is not None: gold_remaining.remove(gold_match) shared.append(test) else: test_unique.append(test) if count_only: return (len(test_unique), len(shared), len(gold_remaining)) else: return (test_unique, shared, gold_remaining)