;************************************************************************ ; Non-linear matching and efficient (cyclic) unification ; on ordered trees (terms) ; ; One (in case of matching) or both (for unification) trees may ; contain variables, whose names begin with an underscore (see the ; pattern-var? predicate). A variable whose name is a lone underscore ; _ is an anonymous, wildcard variable, like the one in Prolog. ; ; The result of matching or unification is a set of bindings of ; pattern variables to terms. We call this set 'environment' (env). ; In the term re-writing literature such a set of bindings is usually ; called 'substitutions'. Procedures 'match-tree' and 'unify-trees' ; accept the initial environment as an argument and return the ; resulting environment -- the substitutions to make both trees ; identical. The initial environment argument lets the user pass the ; existing substitutions so the user can match or unify a set of terms ; incrementally. Often though the initial environment is '(). If the ; match or unification fail, they return #f. ; ; Besides match-tree and unify-trees procedures, this file defines ; procedures that apply substitutions to terms and ancillary functions. ; (define (pattern-var? x) (and (symbol? x) (char=? #\$ (string-ref (symbol->string x) 0)))) ; A non-linear match of two ordered trees, one of which may contain variables ; ; match-tree TREE1 TREE2 ENV -> ENV ; ; TREE1 may contain variables (as decided by the pattern-var? ; predicate above). TREE1 may contain several occurrences of the same ; variable. All these occurrences must match the same value. A ; variable match is entered into the environment. A variable _ is an ; exception: its match is never entered into the environment. The ; function returns the resulting environment or #f if the match fails. (define (match-tree tree1 tree2 env) (cond ((eq? tree1 tree2) env) ; terms are identical ((pair? tree1) ; Recursively match pairs (and-let* ( ((pair? tree2)) (env-new (match-tree (car tree1) (car tree2) env))) (match-tree (cdr tree1) (cdr tree2) env-new))) ((null? tree1) #f) ; if tree2 had been '(), see 1st cond ((eq? '_ tree1) env) ; _ matches everything ((pattern-var? tree1) (cond ((assq tree1 env) => (lambda (prev-binding) ; variable occurred before (if (eq? (cdr prev-binding) tree2) env ; try eq? first, it's faster (and (equal? (cdr prev-binding) tree2) env)))) (else (cons (cons tree1 tree2) env)) ; new variable, enter fresh binding )) (else (and (equal? tree1 tree2) env)))) ; A match/unification of two ordered trees, both of which may contain ; variables. The unifier is capable of finding cyclic substitutions. ; unify-trees TREE1 TREE2 ENV -> ENV ; ; The trees may contain several occurrences of the same variable. All ; these occurrences must match the same value. A variable match is ; entered into the environment. A variable _ is an exception: its ; match is never entered into the environment. ; The function unify-trees returns the resulting environment or #f if the ; unification fails. ; ; The semantics of unification: ; if A and B are two terms and (unify-trees A B init-env) succeeds and ; yields E, which is _acyclic_, then ; (match-tree (substitute-vars-recursively A E) ; (substitute-vars-recursively B E) ; '()) ; succeeds. ; ; The code isn't too dumb: it does not suffer from the exponential ; blow-up in time and in space, as is the case with the conventional ; unification algorithm (recursive descent with the composition of ; substitutions) due to Robinson, 1965. In fact, the algorithm below ; is linear in space: the size of the 'env' (the substitutions) is ; proportional to the size of the input trees. ; ; Often unify-trees function should be followed by normalize-subst ; below. Splitting the unification into two phases is for efficiency ; reasons: If we are interested only in the fact if two terms unify, ; we don't need to run normalize-subst. (define (genvar) (string->symbol (string-append "$" (symbol->string (gensym))))) (define (unify-trees tree1 tree2 env) (cond ((eq? tree1 tree2) env) ; especially helps if both tree1 & tree2 bound ((eq? '_ tree1) env) ; _ matches anything ((eq? '_ tree2) env) ; _ matches anything ((pattern-var? tree1) (cond ((assq tree1 env) => ; tree1 is a bound var (lambda (prev-binding) ;(cout "prev binding: " tree1 "," tree2 "," env nl) (unify-trees (cdr prev-binding) tree2 env))) ; tree1 is a free variable ((and (pattern-var? tree2) (assq tree2 env)) => ; tree2 is a bound var ; binding a free variable to a bound. Search for a substantial binding ; or a loop. If we find a loop tree1->tree2->...->tree1 ; then we do not enter the binding to tree1, because tree1 is not ; actually constrained. ; The other benefit of the loop below is that it prevents ; chains of bindings a->b, b->c, c->d ... d->nonvar ; We straighten out such chains. cf. union-find algorithm. (lambda (tree2-binding) (let loop ((binding tree2-binding)) (cond ((eq? tree1 (cdr binding)) env) ; loop: no binding needed ((and (pattern-var? (cdr binding)) (assq (cdr binding) env)) => loop) (else (cons (cons tree1 (cdr binding)) env)))))) ((pattern-var? tree2) ; tree2 is a free variable, tree2 /= tree1 (cons (cons tree1 tree2) env)) ((pair? tree2) ; tree1 is free, tree2 is a tree (and-let* ((car-var (genvar)) (cdr-var (if (null? (cdr tree2)) '() (genvar))) (car-env (unify-trees car-var (car tree2) (cons (cons tree1 (cons car-var cdr-var)) env)))) (unify-trees cdr-var (cdr tree2) car-env))) (else ; tree1 is free, tree2 is atomic (cons (cons tree1 tree2) env)))) ((pattern-var? tree2) (unify-trees tree2 tree1 env)) ((null? tree1) #f) ; tree2 is not a var, is not eq? tree1 ((pair? tree1) (and-let* (((pair? tree2)) (car-env (unify-trees (car tree1) (car tree2) env))) (unify-trees (cdr tree1) (cdr tree2) car-env))) (else (and (equal? tree1 tree2) env)))) ; Determine if 'pred' holds for every element of the list (define (every? pred lst) (or (null? lst) (and (pred (car lst)) (every? pred (cdr lst))))) ; apply a leaf-transformer to every leaf of the tree and return the result. ; If the leaf-transformer returns #f, the corresponding leaf is ; removed from the resulting tree ; See term-variables function below (define (tree-traverse leaf-transformer tree) (cond ((null? tree) tree) ((not (pair? tree)) (leaf-transformer tree)) (else (let ((new-car (tree-traverse leaf-transformer (car tree))) (new-cdr (or (tree-traverse leaf-transformer (cdr tree)) '()))) (if (and new-car (not (null? new-car))) (cons new-car new-cdr) new-cdr))))) ; Find all the vars in term and substitute them with values given ; the env. It's an error if a term contains an unbound var (define (substitute-vars term env) (tree-traverse (lambda (leaf) (cond ((not (pattern-var? leaf)) leaf) ((assq leaf env) => cdr) (else (error "undefined pattern var " leaf " in env " env)))) term)) ; The same as above but accept an open term (that is, a term ; with variables that are not bound in env). Leave the free variables ; as they are. (define (substitute-vars-open term env) (tree-traverse (lambda (leaf) (cond ((not (pattern-var? leaf)) leaf) ((assq leaf env) => cdr) (else leaf))) term)) ; Normalize substitutions: ; normalize-subst ENVI ACCEPT-CYCLIC? -> (ENVO | #f) ; ; ENVI is a set of bindings as returned by unify-trees. Each binding in ; ENVI should have the form ; var->var ; var->atom ; var->(cons var-or-atom var-or-atom) ; ; ENVO is, informally speaking, a transitive closure of ENVI. If ; ENVI is acyclic, then ENVO is the fixpoint of composing ENVI with ; itself. To be more precise, ENVO is a set of bindings. It contains ; as many bindings as ENVI. Each binding of ENVO has the form ; var -> term ; If ENVI is acyclic, then term is either variable-free or contains ; only the variables that were not bound in ENVI. If ENVI is cyclic, ; then term may contain variables that are bound in ENVI (and ENVO). ; If ENVI is cyclic, it may contain bindings that are: ; - simply cyclic: the result of unifying (f _y) and _y ; - simply mutually recursive: unifying (p _y _a) and (p (f _a) (g _y)) ; - complex mutually recursive: unifying ; (p _y _a) and (p (f _y _a) (g _y _a)) ; I don't know how deeply I should unroll mutually-recursive ; bindings. At present, normalize-subst does not unroll them at all. ; SWI Prolog seems to unroll cyclic substitutions a few steps when ; printing them out. Sometimes this approach works out, and sometimes ; it doesn't: ; [user]. ; eq(X,X). ; ; ?- eq(Y,f(Y)). ; Y = f(f(f(f(f(f(f(f(f(f(...)))))))))) % Looks good ; ; ?- eq(p(Y,A),p(f(A),g(Y))). ; Y = f(g(f(g(f(g(f(g(f(g(...)))))))))) % Still looks good ; A = g(f(g(f(g(f(g(f(g(f(...)))))))))) ; ; ?- eq(p(Y,A),p(f(Y,A),g(Y,A))). ; ...long-and-unwieldy output... ; ; If the flag ACCEPT-CYCLIC? is #f and ENVI turns out to be cyclic, we ; return #f. If ACCEPT-CYCLIC? is #t, we always return the ; environment, which may contain recursive and mutually recursive ; bindings. Setting the flag ACCEPT-CYCLIC? to #f essentially enables ; the occurs-check (see below). ; ; A careful reader might note that our unification algorithm does not ; have an explicit occurs-check. Many Prolog systems in fact omit the ; occurs check altogether for performance reasons: see SICTus Prolog ; Manual, p. 50. [http://citeseer.nj.nec.com/laboratory01sicstus.html] ; The manual states that unifying without the occurs-check is wrong ; but practical. In our algorithm however, cyclic and acyclic ; unifications have exactly the same time and space complexities. The ; occurs-check (passing #f as ACCEPT-CYCLIC?) does _not_ impose any ; penalty at all. ; ; Algorithm: breadth-first iterative composition of ENVI with itself ; until the fixpoint is reached or a set of mutually-recursive ; bindings is established. The tests below show that the algorithm has ; a very nice time and space complexities, even in pathological cases. ; The nice complexities stem from the fact that we avoid repeated ; traversals and rebuilding of terms searching for variables to ; substitute, as we did in the naive case. ; ; Because we store bindings in an associative list and use assq to ; locate a binding, we cannot generally expect a linear time ; complexity (although we do achieve it in many, including the ; pathological, cases). Storing bindings in a hash table will decrease ; the time complexity. ; ; It seems that our algorithm is somewhat similar to Gerard Huet's ; linear unification algorithm. The latter has quite different intuition: ; re-writing of equations into a canonical form. In a sense, ; our unify-trees generates equations and normalize-subst strongly ; normalizes them, in O(n) iterations. (define (normalize-subst env accept-cyclic?) ; Partition the env into ground bindings and composite bindings. ; init-ground-terms ENVI -> GROUND-ENV NONGR-ENV ; where each binding in GROUND-ENV is either ; var->atom or ; var->free_var ; where free_var is a variable that is not bound by ENVI ; Each binding in NONGR-ENV has the form ; var-> (cons var-or-atom var-or-atom) (define (init-ground-terms env) (let loop ((env env) (ground '()) (non-ground '())) (if (null? env) (values ground non-ground) (let* ((binding (car env)) (env (cdr env)) (var (car binding)) (term (cdr binding))) (cond ((pair? term) ; composite binding (loop env ground (cons binding non-ground))) ((pattern-var? term) (cond ((assq term ground) => (lambda (ground-binding) (loop env (cons (cons var (cdr ground-binding)) ground) non-ground))) ; This case isn't common as unify-trees tries to avoid ; bindings of the form var->bound_var ((assq term non-ground) => (lambda (non-ground-binding) (loop env ground (cons (cons var (cdr non-ground-binding)) non-ground)))) ; Reduce the chain var->bound_var. Again, it's uncommon ((assq term env) => (lambda (unk-binding) (loop (cons (cons var (cdr unk-binding)) env) ground non-ground))) ; term is a free pattern-var. It's considered ground (else (loop env (cons binding ground) non-ground)) )) (else ; term is an atom (loop env (cons binding ground) non-ground))))))) ; Perform one step of the environment composition ; keep-resolving GROUND-ENV NONGR-ENV -> GROUND-ENV' NONGR-ENV' CHANGED? ; We compose NONGR-ENV with GROUND-ENV, add the composed bindings to the ; output GROUND-ENV' and return the left-over bindings in NONGR-ENV'. ; A boolean flag CHANGED? tells the success of the composition. ; Each binding of NONGR-ENV has the form ; var-> (cons var-or-atom1 var-or-atom2) ; That binding can be composed with GROUND-ENV iff both var-or-atom1 and ; var-or-atom2 are (i) atoms, (ii) one is an atom and the other is a variable ; bound in GROUND-ENV, (iii) both are variables bound in GROUND-ENV. ; The composed binding becomes a ground binding and moved to GROUND-ENV'. (define (keep-resolving ground non-ground) (let loop ((env non-ground) (ground ground) (non-ground '()) (changed? #f)) (if (null? env) (values ground non-ground changed?) (let* ((binding (car env)) (env (cdr env)) (var (car binding)) (term1 (cadr binding)) (term2 (cddr binding)) (term1-var? (pattern-var? term1)) (term2-var? (pattern-var? term2)) (term1-gb (and term1-var? (assq term1 ground))) (term2-gb (and term2-var? (assq term2 ground)))) (cond ((and term1-gb term2-gb) ; successful composition (loop env (cons (cons var (cons (cdr term1-gb) (cdr term2-gb))) ground) non-ground #t)) ((or (eq? term1-var? (not term1-gb)) ; Could not find one (eq? term2-var? (not term2-gb))) ; ground binding (loop env ground (cons binding non-ground) changed?)) (term1-gb ; term2 is atom, term1 resolves to ground (loop env (cons (cons var (cons (cdr term1-gb) term2)) ground) non-ground #t)) (term2-gb ; term1 is atom, term2 resolves to ground (loop env (cons (cons var (cons term1 (cdr term2-gb))) ground) non-ground #t)) (else ; both term1 and term2 are atoms (loop env (cons binding ground) non-ground #t))))))) ; The fixpoint algorithm (define (iter ground non-ground changed?) (cond ((null? non-ground) ground) (changed? (call-with-values (lambda () (keep-resolving ground non-ground)) iter)) (accept-cyclic? ; potentially we may wish to unroll (append non-ground ground)) ; non-ground for a few steps (else #f))) ; Get the ball rolling... (call-with-values (lambda () (call-with-values (lambda () (init-ground-terms env)) keep-resolving)) iter) ) ; Give the list of all the vars in the term, with no duplicates (define (term-variables term) (define (flatten-unique lst dest) (cond ((null? lst) dest) ((pair? lst) (flatten-unique (cdr lst) (flatten-unique (car lst) dest))) ((memq lst dest) dest) (else (cons lst dest)))) (flatten-unique (tree-traverse (lambda (leaf) (and (pattern-var? leaf) leaf)) term) '())) ; Return a term where all the variable names are unique (define (alfa-convert term) (let* ((vars (term-variables term)) (new-vars (map (lambda (_) (genvar)) vars))) (substitute-vars term (map cons vars new-vars)))) ; Check if two trees can be unified without creating cyclic substitutions (define (unify-simple? tree1 tree2) (and-let* ((env (unify-trees tree1 tree2 '())) ((normalize-subst env #f))) #t))