;;; -*- Hen -*- ;;;; levenshtein-vector-functor.scm ;;;; Kon Lovett, Sep 16 2005 ;;;; Kon Lovett, Apr 2012 (include "levenshtein-cost-interface") (functor (levenshtein-vector-functor (CO COST-OPER)) (;export levenshtein-distance/vector*) (import scheme chicken CO) (use data-structures srfi-1 srfi-63 vector-lib moremacros numeric-macros type-checks type-errors levenshtein-operators) ;;; ;; (define (vector-minimum vec) (let ((len (vector-length vec))) (if (fx= 0 len) (values #f #f) (let ((idx 0) (val (vector-ref vec 0)) ) (do ((i 1 (fx++ i))) ((fx= i len) (values val idx)) (let ((nval (vector-ref vec i))) (unless (cost-less-than val nval) (set! idx i) (set! val nval) ) ) ) ) ) ) ) ;; (define (levenshtein/vector sv tv n m cv av lv elm-eql perf finf) (let ((wrkvec (make-vector (fx++ m))) (inscst (vector-ref cv 0)) (delcst (vector-ref cv 1)) ) (do ((k 0 (fx++ k)) (cst 0 (cost-add cst inscst))) ((fx> k m)) (vector-set! wrkvec k cst) ) (let ((next #f)) (do ((i 0 (fx++ i))) ((fx= i n) (finf next)) (let ((s@i (vector-ref sv i))) (do ((j 0 (fx++ j)) (cur (cost-multiply delcst (fx++ i)) next)) ((fx= j m) (vector-set! wrkvec m next)) (let ((apply-oper (lambda (eo-i cost) (let ((a (vector-ref av eo-i)) (l (vector-ref lv eo-i))) (if (fx= a 0) ; A L (if (fx= l 0) ; - - (cost-add cost (vector-ref wrkvec j)) ; 0 0 (cost-add cost (vector-ref wrkvec (fx++ j)))); 0 1 (if (fx= l 0) ; (cost-add cost cur) ; 1 0 (let ((cost@ (vector-ref wrkvec j))) ; (if (elm-eql s@i (vector-ref tv j)) ; 1 1 cost@ (cost-add cost cost@))))))))) (let-values (((cost index) (vector-minimum (vector-map apply-oper cv)))) (perf i j index cost) (set! next cost) (vector-set! wrkvec j cur) ) ) ) ) ) ) ) ) ;; (define (levenshtein/matrix sv tv n m cv av lv elm-eql perf finf) (let ((mat (make-array '#() (fx++ n) (fx++ m))) (inscst (vector-ref cv 0)) (delcst (vector-ref cv 1))) (do ((j 0 (fx++ j)) (cst 0 (cost-add cst inscst))) ((fx> j m)) (array-set! mat cst 0 j) ) (do ((i 1 (fx++ i))) ((fx> i n) (finf (array-ref mat n m))) (array-set! mat (cost-multiply i delcst) i 0) (let* ((i-1 (fx-- i)) (s@i (vector-ref sv i-1)) ) (do ((j 1 (fx++ j))) ((fx> j m)) (let* ((j-1 (fx-- j)) (t-j (vector-ref tv j-1)) ) (let ((apply-oper (lambda (eo-i opercost) (let ((a (vector-ref av eo-i)) (l (vector-ref lv eo-i)) ) ; Must be within bounds of matrix (if (and (fx>= i a) (fx>= j l)) (let ((currcost (array-ref mat (fx- i a) (fx- j l)))) (cond ; Allow a no-op cost <> 0 ((and (fx= a 0) (fx= l 0)) (cost-add currcost opercost)) ; Special case w/ test, simplified ((and (fx= a 1) (fx= l 1)) (if (elm-eql s@i t-j) currcost (cost-add currcost opercost))) ; General case w/ test ((or (fx> a 1) (fx> l 1)) (let ((x (cost-add currcost opercost))) (unless (elm-eql (vector-ref sv (fx- i a)) t-j) (set! x (cost-add x opercost))) (unless (elm-eql s@i (vector-ref tv (fx- j l))) (set! x (cost-add x opercost))) x)) ; Otherwise a = 0|1 & l = 1|0 (else (cost-add currcost opercost) ) ) ) ; Does this make sense when operation would violate mat bounds? cost-positive-infinity))))) (let-values (((cost index) (vector-minimum (vector-map apply-oper cv)))) ; Performed operation matrix is 0-based (perf i-1 j-1 index cost) (array-set! mat cost i j) ) ) ) ) ) ) ) ) ;;; #| Discussion - Inputs - UTF8 character encoding String, Vector, List Edit Operation Specification Linear Gap Cost Specification - Run of insertions (or deletions) of length x, has a cost of ax+b, for constants a and b. If b>0, this penalises numerous short runs of insertions and deletions. Cost limiting Common prefix/suffix stripping Outputs - Total Cost Cost of Each Performed Operation Each Performed Operation Linear Gap Cost |# (define (levenshtein-distance/vector* srcvec trgvec #!rest operlist #!key operations (elm-eql char=?)) (check-vector 'levenshtein-distance/vector* srcvec "source") (check-vector 'levenshtein-distance/vector* trgvec "target") (check-procedure 'levenshtein-distance/vector* elm-eql "elm-eql") ; Note that the edit-distance procedures ; return via the (finf) procedure. (let* ((operlist (filter levenshtein-operator? operlist)) ;only opers (opervec (levenshtein-base-operators-vector)) (insoper (vector-ref opervec 0)) ) ; List of edit operations? (unless (null? operlist) ; Verify valid operators (for-each (cut check-levenshtein-operator 'levenshtein-distance/vector* <>) operlist) (set! opervec (list->vector operlist)) ; Insert operator must be 1st in vector (let ((idx (vector-index levenshtein-insert-operator? opervec))) (cond ((not idx) (set! idx (vector-length opervec)) (set! opervec (vector-append opervec (vector insoper))) (vector-swap! opervec 0 idx)) ((fx> idx 0) (vector-swap! opervec 0 idx)))) (set! insoper (vector-ref opervec 0)) ; Delete operator must be 2nd in vector (let ((idx (vector-index levenshtein-delete-operator? opervec))) (cond ((not idx) (set! idx (vector-length opervec)) (set! opervec (vector-append opervec (vector (vector-ref (levenshtein-base-operators-vector) 1)))) (vector-swap! opervec 1 idx)) ((fx> idx 1) ;can't be zero, see above (vector-swap! opervec 1 idx))))) ; Setup for no operation introspection (let ((srclen (vector-length srcvec)) (trglen (vector-length trgvec)) (perf void) ; Perform operation accumulate (finf identity) ; Finish (zrtf ; Zero-length, assume 'finf' binding to 'identity' (lambda (len) (cost-multiply len (levenshtein-operator-cost insoper))))) ; Use shorter as the target (when (fx< srclen trglen) (swap-set! srclen trglen) (swap-set! srcvec trgvec)) ; Setup callback & return procedures for performed operations matrix ; should caller want operation introspection (when operations (if (and (fx= 0 srclen) (fx= 0 trglen)) ;then degenerate case (set! zrtf (lambda (x) (values 0 (make-array '#() 0 0)))) ;else source or target non-empty (let ((pm #f) (pm-rows (fxmax 1 srclen)) (pm-cols (fxmax 1 trglen))) (set! pm (make-array '#() pm-rows pm-cols)) (set! perf (lambda (i j ovi cost) (array-set! pm (cons cost (vector-ref opervec ovi)) i j))) (set! finf (lambda (cost) (values cost pm))) (set! zrtf (lambda (len) (let ((io-c (levenshtein-operator-cost insoper))) (do ((i 0 (fx++ i)) (i-cost io-c (cost-add i-cost io-c))) ((fx= i pm-rows)) (do ((j 0 (fx++ j)) (j-cost i-cost (cost-add j-cost io-c))) ((fx= j pm-cols)) (perf i j 0 j-cost))) (finf (cost-multiply io-c len)))))))) ; Unpack edit operation offsets (let ((cstvec (vector-map (lambda (i eo) (levenshtein-operator-cost eo)) opervec)) (abvvec (vector-map (lambda (i eo) (levenshtein-operator-above eo)) opervec)) (lftvec (vector-map (lambda (i eo) (levenshtein-operator-left eo)) opervec))) ; Handle empty source/target special case, then choose algorithm based ; on complexity of edit operations (cond ((fx= 0 srclen) (zrtf trglen)) ((fx= 0 trglen) (zrtf srclen)) ((or (null? operlist) (every levenshtein-base-operator? operlist)) (levenshtein/vector srcvec trgvec srclen trglen cstvec abvvec lftvec elm-eql perf finf)) (else (levenshtein/matrix srcvec trgvec srclen trglen cstvec abvvec lftvec elm-eql perf finf))) ) ) ) ) ) ;functor levenshtein-vector-functor