;;;; levenshtein-vector-functor.scm -*- Scheme -*- ;;;; Kon Lovett, Mar '20 ;;;; Kon Lovett, Apr '12 ;;;; Kon Lovett, Sep '05 (include "levenshtein-cost-interface") (functor (levenshtein-vector-functor (CO COST-OPER)) (;export levenshtein-distance/vector*) (import scheme) (import (chicken base)) (import (chicken type)) (import (srfi 1)) (import (srfi 63)) (import vector-lib) (import moremacros) (import numeric-macros) (import type-checks) (import type-errors) (import levenshtein-operators) (import CO) ;;; ;; Types (define-type array (struct array)) (: levenshtein-distance/vector* (vector vector #!rest -> number (or boolean array))) ;; (define (vector-minimum vec) (let ((len (vector-length vec))) (if (zero? len) (values #f #f) (let ((idx 0) (val (vector-ref vec 0)) ) (do ((i 1 (++ i))) ((= i len) (values val idx)) (let ((nval (vector-ref vec i))) (unless (cost-less-than val nval) (set! idx i) (set! val nval) ) ) ) ) ) ) ) ;; (define (levenshtein/vector sv tv n m cv av lv elm-eql perf finf) (let ((wrkvec (make-vector (++ m))) (inscst (vector-ref cv 0)) (delcst (vector-ref cv 1)) ) (do ((k 0 (++ k)) (cst 0 (cost-add cst inscst))) ((> k m)) (vector-set! wrkvec k cst) ) (let ((next #f)) (do ((i 0 (++ i))) ((= i n) (finf next)) (let ((s@i (vector-ref sv i))) (do ((j 0 (++ j)) (cur (cost-multiply delcst (++ i)) next)) ((= j m) (vector-set! wrkvec m next)) (let ((apply-oper (lambda (eo-i cost) (let ((a (vector-ref av eo-i)) (l (vector-ref lv eo-i))) (if (zero? a) ; A L (if (= l 0) ; - - (cost-add cost (vector-ref wrkvec j)) ; 0 0 (cost-add cost (vector-ref wrkvec (++ j)))) ; 0 1 (if (zero? l) ; (cost-add cost cur) ; 1 0 (let ((cost@ (vector-ref wrkvec j))) ; (if (elm-eql s@i (vector-ref tv j)) ; 1 1 cost@ (cost-add cost cost@))))))))) (let-values (((cost index) (vector-minimum (vector-map apply-oper cv)))) (perf i j index cost) (set! next cost) (vector-set! wrkvec j cur) ) ) ) ) ) ) ) ) ;; (define (levenshtein/matrix sv tv n m cv av lv elm-eql perf finf) (let ((mat (make-array '#() (++ n) (++ m))) (inscst (vector-ref cv 0)) (delcst (vector-ref cv 1))) (do ((j 0 (++ j)) (cst 0 (cost-add cst inscst))) ((> j m)) (array-set! mat cst 0 j) ) (do ((i 1 (++ i))) ((> i n) (finf (array-ref mat n m))) (array-set! mat (cost-multiply i delcst) i 0) (let* ((i-1 (-- i)) (s@i (vector-ref sv i-1)) ) (do ((j 1 (++ j))) ((> j m)) (let* ((j-1 (-- j)) (t-j (vector-ref tv j-1)) ) (let ((apply-oper (lambda (eo-i opercost) (let ((a (vector-ref av eo-i)) (l (vector-ref lv eo-i)) ) ; Must be within bounds of matrix (if (and (>= i a) (>= j l)) (let ((currcost (array-ref mat (- i a) (- j l)))) (cond ; Allow a no-op cost <> 0 ((and (zero? a) (zero? l)) (cost-add currcost opercost)) ; Special case w/ test, simplified ((and (= a 1) (= l 1)) (if (elm-eql s@i t-j) currcost (cost-add currcost opercost))) ; General case w/ test ((or (> a 1) (> l 1)) (let ((x (cost-add currcost opercost))) (unless (elm-eql (vector-ref sv (- i a)) t-j) (set! x (cost-add x opercost))) (unless (elm-eql s@i (vector-ref tv (- j l))) (set! x (cost-add x opercost))) x)) ; Otherwise a = 0|1 & l = 1|0 (else (cost-add currcost opercost) ) ) ) ; Does this make sense when operation would violate mat bounds? cost-positive-infinity))))) (let-values (((cost index) (vector-minimum (vector-map apply-oper cv)))) ; Performed operation matrix is 0-based (perf i-1 j-1 index cost) (array-set! mat cost i j) ) ) ) ) ) ) ) ) ;;; #| Discussion - Inputs - UTF8 character encoding String, Vector, List Edit Operation Specification Linear Gap Cost Specification - Run of insertions (or deletions) of length x, has a cost of ax+b, for constants a and b. If b>0, this penalises numerous short runs of insertions and deletions. Cost limiting Common prefix/suffix stripping Outputs - Total Cost Cost of Each Performed Operation Each Performed Operation Linear Gap Cost |# (define (levenshtein-distance/vector* srcvec trgvec #!rest operlist #!key operations (elm-eql char=?)) (check-vector 'levenshtein-distance/vector* srcvec "source") (check-vector 'levenshtein-distance/vector* trgvec "target") (check-procedure 'levenshtein-distance/vector* elm-eql "elm-eql") ; Note that the edit-distance procedures ; return via the (finf) procedure. (let* ((operlist (filter levenshtein-operator? operlist)) ;only opers (opervec (levenshtein-base-operators-vector)) (insoper (vector-ref opervec 0)) ) ; List of edit operations? (unless (null? operlist) ; Verify valid operators (for-each (cut check-levenshtein-operator 'levenshtein-distance/vector* <>) operlist) (set! opervec (list->vector operlist)) ; Insert operator must be 1st in vector (let ((idx (vector-index levenshtein-insert-operator? opervec))) (cond ((not idx) (set! idx (vector-length opervec)) (set! opervec (vector-append opervec (vector insoper))) (vector-swap! opervec 0 idx)) ((positive? idx) (vector-swap! opervec 0 idx)))) (set! insoper (vector-ref opervec 0)) ; Delete operator must be 2nd in vector (let ((idx (vector-index levenshtein-delete-operator? opervec))) (cond ((not idx) (set! idx (vector-length opervec)) (set! opervec (vector-append opervec (vector (vector-ref (levenshtein-base-operators-vector) 1)))) (vector-swap! opervec 1 idx)) ((> idx 1) ;can't be zero, see above (vector-swap! opervec 1 idx))))) ; Setup for no operation introspection (let ((srclen (vector-length srcvec)) (trglen (vector-length trgvec)) (perf void) ; Perform operation accumulate (finf ; Finish, "identity" (lambda (cost) (values cost #f))) (zrtf ; Zero-length, assume 'finf' binding to "identity" (lambda (len) (cost-multiply len (levenshtein-operator-cost insoper))))) ; Use shorter as the target (when (< srclen trglen) (swap! srclen trglen) (swap! srcvec trgvec)) ; Setup callback & return procedures for performed operations matrix ; should caller want operation introspection (when operations (if (and (zero? srclen) (zero? trglen)) ;then degenerate case (set! zrtf (lambda (x) (values 0 (make-array '#() 0 0)))) ;else source or target non-empty (let ((pm #f) (pm-rows (max 1 srclen)) (pm-cols (max 1 trglen))) (set! pm (make-array '#() pm-rows pm-cols)) (set! perf (lambda (i j ovi cost) (array-set! pm (cons cost (vector-ref opervec ovi)) i j))) (set! finf (lambda (cost) (values cost pm))) (set! zrtf (lambda (len) (let ((io-c (levenshtein-operator-cost insoper))) (do ((i 0 (++ i)) (i-cost io-c (cost-add i-cost io-c))) ((= i pm-rows)) (do ((j 0 (++ j)) (j-cost i-cost (cost-add j-cost io-c))) ((= j pm-cols)) (perf i j 0 j-cost))) (finf (cost-multiply io-c len)))))))) ; Unpack edit operation offsets (let ((cstvec (vector-map (lambda (i eo) (levenshtein-operator-cost eo)) opervec)) (abvvec (vector-map (lambda (i eo) (levenshtein-operator-above eo)) opervec)) (lftvec (vector-map (lambda (i eo) (levenshtein-operator-left eo)) opervec))) ; Handle empty source/target special case, then choose algorithm based ; on complexity of edit operations (cond ((zero? srclen) (zrtf trglen)) ((zero? trglen) (zrtf srclen)) ((or (null? operlist) (every levenshtein-base-operator? operlist)) (levenshtein/vector srcvec trgvec srclen trglen cstvec abvvec lftvec elm-eql perf finf)) (else (levenshtein/matrix srcvec trgvec srclen trglen cstvec abvvec lftvec elm-eql perf finf))) ) ) ) ) ) ;functor levenshtein-vector-functor