;; ;; Compute the longest common subsequence of two sequences ;; ;; Copyright 2007-2010 Ivan Raikov. ;; ;; This program is free software: you can redistribute it and/or ;; modify it under the terms of the GNU General Public License as ;; published by the Free Software Foundation, either version 3 of the ;; License, or (at your option) any later version. ;; This program is distributed in the hope that it will be useful, but ;; WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; A full copy of the GPL license can be found at ;; . (module npdiff (export diffop? Insert Remove Change make-npdiff make-hunks) (import scheme chicken data-structures ) (require-extension srfi-1 srfi-4 datatype ) (include "box.scm") (include "stack.scm") (define (psplit2 lst) (values (car lst) (cdr lst))) (define (intpair? x) (and (pair? x) (fixnum? (car x)) (fixnum? (cdr x)))) ; Datatype: diffop ; ; A representation of the three diff operations; insert, remove, change. ; ; TARGET is the line or range of lines that is being operated on ; ; SOURCE is the range of lines that is used as input of the insert and ; change commands. ; ; DATA, DATAIN, DATAOUT is a sequence of the ; elements (e.g. lines) that are being inserted or replaced. ; ; CONTEXT, CONTEXTIN, CONTEXTOUT is optional context; these are pairs ; in which the car is a list of elements preceding the operation, and ; the cdr is a list of elements following the operation. ; (define-datatype diffop diffop? (Insert (target fixnum?) (source intpair?) (seq list?) (context (lambda (x) (or (not x) (list? x)))) ) (Remove (target intpair?) (seq list?) (context (lambda (x) (or (not x) (list? x)))) ) (Change (target intpair?) (source intpair?) (seqin list?) (seqout list?) (contextin (lambda (x) (or (not x) (list? x)))) (contextout (lambda (x) (or (not x) (list? x)))))) (define-record-printer (diffop x out) (cases diffop x (Insert (target source seq context) (display "#(Insert" out) (display (conc " target=" target) out) (display (conc " source=" source) out) (display (conc " seq=" seq) out) (display (conc " context=" context) out) (display ")" out)) (Remove (target seq context) (display "#(Remove " out) (display (conc " target=" target) out) (display (conc " seq=" seq) out) (display (conc " context=" context) out) (display ")" out)) (Change (target source seqin seqout contextin contextout) (display "#(Change" out) (display (conc " target=" target) out) (display (conc " source=" source) out) (display (conc " seqin=" seqin) out) (display (conc " seqout=" seqout) out) (display (conc " contextin=" contextin) out) (display (conc " contextout=" contextout) out) (display ")" out)))) ;; ;; ;; S. Wu, U. Manber, and E. Myers. An O(NP) sequence comparison ;; algorithm. In Information Processing Letters, volume 35, pages ;; 317--323, September 1990. ;; ;; (define (make-npdiff equal? is-ref is-length hunks) (lambda (A B . rest) (let-optionals rest ((context-len 0)) (define css (make-stack)) (let ((M (is-length A)) (N (is-length B))) (let-values (((A B M N swap) (if (fx> M N) (values B A N M #t) (values A B M N #f)))) ;; The algorithm outlined in the paper calls for the creation ;; of an array that contains the furthest paths, and that is ;; defined as [-(M+1),(N+1)]. ;; Since the vector library in Scheme does not support negative ;; array indices, we are going to have to bump everything by ;; offset M+1 whenever accessing array FP (define (compare delta offset fp p) (define (update k) (s32vector-set! fp (fx+ k offset) (snake k (max (fx+ 1 (s32vector-ref fp (fx+ offset (fx- k 1)))) (s32vector-ref fp (fx+ offset (fx+ 1 k))))))) (define (lowerloop k) (if (fx<= k (fx- delta 1)) (begin (update k) (lowerloop (fx+ 1 k))))) (define (upperloop k) (if (fx>= k (fx+ 1 delta)) (begin (update k) (upperloop (fx- k 1))))) (let ((p (fx+ p 1))) (lowerloop (fx* -1 p)) (upperloop (fx+ delta p)) (update delta) (if (not (fx= N (s32vector-ref fp (fx+ offset delta)))) (compare delta offset fp p)))) (define (snake k y) (let ((a (fx- y k)) (b y)) (let-values ((( x y ) (let loop ((x a) (y b)) (if (and (fx< x M) (fx< y N) (equal? (is-ref A x) (is-ref B y))) (loop (fx+ 1 x) (fx+ y 1)) (values x y))))) (if (or (not (fx= a x)) (not (fx= b y))) (let-values (((lasta lastb) (if (stack-empty? css) (values -1 -1) (psplit2 (car (stack-rest css)))))) (if (and (fx< lasta a) (fx< lastb b)) ;; we have found a common substring; push the end ;; and start pairs onto the common substring stack (if swap (begin (stack-push! css (cons b a)) (stack-push! css (cons y x))) (begin (stack-push! css (cons a b)) (stack-push! css (cons x y))))))) y))) (let ((offset (fx+ 1 M)) (fp (make-s32vector (fx+ 3 (fx+ M N)) -1)) (delta (fx- N M)) (p -1)) (compare delta offset fp p) (if swap (values (hunks B A css context-len) B A) (values (hunks A B css context-len) A B)))))))) ;; Pop matching pairs from the given stack, and fill in the gaps ;; between them with insert/change/remove hunks. ;; ;; This function expects the following stack layout: ;; ;; endpair n ;; startpair n ;; endpair n-1 ;; startpair n-1 ;; . ;; . ;; . ;; endpair 1 ;; startpair 1 ;; ;; i.e. the one constructed by function `npdiff' above. endpair ;; marks the end of a common substring. startpair marks the beginning ;; of a common substring. Each pair has the form (x,y) where x is a ;; line number in text A, and y is a line number in text B. ;; ;; If substring n (i.e. the one at the top of the stack) does not ;; reach the last line of text A (its endpair does NOT have the last ;; line number in A as x coordinate) that means we have some extra ;; lines at the end of text A that need to be removed, so we make a ;; remove hunk for them. If instead the y component does not reach ;; the end of B, we make an insert hunk. ;; ;; If substring 1 (i.e. the one at the bottom of the stack) does not ;; start from the first line of text A (its endpair does NOT have 0 ;; as y coordinate) that means we have some extra lines at the ;; beginning of text B that need to be inserted, so we make an insert ;; hunk for them. If instead the x component is non-zero, we make a ;; remove hunk. ;; ;; For all other cases, we make change hunks that fill in the gaps ;; between any two common substrings. (define (make-hunks is-ref is-length is-slice) (lambda (A B css . rest) (let-optionals rest ((context-len 0)) (let ((M (is-length A)) (N (is-length B)) (context? (fx> context-len 0))) (define (make-context seq len start end) (if (or (fx> start len) (fx< end start)) (list) (let ((start (if (fx< start 0) 0 start)) (end (if (fx< len end) len end))) (is-slice seq start end)))) (define (loop css hunks) (if (stack-empty? css) hunks ;; make a change hunk and recurse (let-values (((endpair startpair) (stack-ppeek css))) (let ((k (stack-depth css))) (let-values (((x y) (psplit2 startpair)) ((w z) (psplit2 endpair))) ;; are these the the last two elements of the stack? (if (fx= 1 k) (cond ((and (fx= 0 x) (fx= 0 y)) hunks) ((fx= 0 x) (cons (Insert x (cons 0 y) (is-slice B 0 y) (and context? (cons (list) (make-context B N y (fx+ y context-len))))) hunks)) ((fx= 0 y) (cons (Remove (cons 0 x) (is-slice A 0 x) (cons (list) (make-context A M x (fx+ x context-len)))) hunks)) (else (cons (Change (cons 0 x) (cons 0 y) (is-slice B 0 y) (is-slice A 0 x) (and context? (cons (list) (make-context B N y (fx+ y context-len)))) (and context? (cons (list) (make-context A M x (fx+ x context-len))))) hunks))) (begin (stack-pop! css) (stack-pop! css) (let-values (((w z) (values x y)) ((x y) (psplit2 (stack-peek css)))) (let ((newhunk (cond ((fx= y z) (Remove (cons x w) (is-slice A x w) (and context? (cons (make-context A M (fx- x context-len) x) (make-context A M x (fx+ x context-len)))))) ((fx= x w) (Insert x (cons y z) (is-slice B y z) (and context? (cons (make-context B N (fx- y context-len) y) (make-context B N z (fx+ z context-len)))))) (else (Change (cons x w) (cons y z) (is-slice B y z ) (is-slice A x w) (and context? (cons (make-context B N (fx- y context-len) y) (make-context B N z (fx+ z context-len)))) (and context? (cons (make-context A M (fx- x context-len) x) (make-context A M w (fx+ w context-len))))))))) ;; (match hunks ;; ((h . rest) (loop css (merge newhunk h rest))) (loop css (if newhunk (cons newhunk hunks) hunks))))))))))) (if (stack-empty? css) (cond ((and (zero? M) (zero? N)) ;; both sequences are empty (list)) ((zero? M) ;; sequence A is empty (list (Insert 0 (cons 0 N) (is-slice B 0 N) (and context? `(()))))) ((zero? N) ;; sequence B is empty (list (Remove (cons 0 M) (is-slice A 0 M) (and context? `(()))))) ;; the two sequences are completely different (list (Change (cons 1 M) (cons 1 N) (is-slice B 0 N) (is-slice A 0 M) (and context? (cons (make-context B N 0 N) (list))) (and context? (cons (make-context A M 0 M) (list))))) ) (let-values (((endpair startpair) (stack-ppeek css))) (let ((k (stack-depth css))) (let-values (((x y) (psplit2 startpair)) ((w z) (psplit2 endpair))) (cond ((and (fx= w M) (fx= z N)) (loop css (list))) ((fx= z N) (loop css (list (Remove (cons w M) (is-slice A w M) (and context? (cons (make-context A M w (fx- w context-len)) (list))))))) ((fx= w M) (loop css (list (Insert w (cons z N) (is-slice B z N) (and context? (cons (make-context B N (fx- z context-len) z) (list))))))) (else (loop css (list (Change (cons w M) (cons z N) (is-slice B z N ) (is-slice A w M) (and context? (cons (make-context B N (fx- z context-len) z) (list))) (and context? (cons (make-context A M (fx- w context-len) w) (list))) ))) )) )) )))))) )