;;;; levenshtein-vector-functor.scm -*- Scheme -*- ;;;; Kon Lovett, Jul '23 ;;;; Kon Lovett, Mar '20 ;;;; Kon Lovett, Apr '12 ;;;; Kon Lovett, Sep '05 #| Discussion - - Gauche https://practical-scheme.net/gauche/man/gauche-refe/Levenshtein-edit-distance.html#Levenshtein-edit-distance - (below) Inputs - X UTF8 character encoding - levenshtein-sequence-utf8 \ String, Vector, List - levenshtein-sequence-utf8/string/vector but no list X Edit Operation Specification - levenshtein-operators Linear Gap Cost Specification - Run of insertions (or deletions) of length x, has a cost of ax+b, for constants a and b. If b>0, this penalises numerous short runs of insertions and deletions. X Cost limiting Common prefix/suffix stripping Outputs - X Total Cost - result X Cost of Each Performed Operation - levenshtein-path-iterator X Each Performed Operation - levenshtein-path-iterator Linear Gap Cost |# (include-relative "levenshtein-cost-interface") (functor (levenshtein-vector-functor (CO COST-OPER)) (;export levenshtein-distance/vector*) (import scheme (chicken base) (chicken type) (chicken fixnum) (srfi 1) (srfi 63) vector-lib miscmacros (only (check-errors basic) define-check+error-type) levenshtein-operators CO) (define-type array (struct array)) (define-type levenshtein-operator (struct levenshtein-operator)) (define-type cost (not boolean)) (: levenshtein-distance/vector* (vector vector ;#!rest (list-of levenshtein-operator) #!key #:operations (list-of levenshtein-operator) #:elm-eql (* * -> boolean) #:limit-cost (or false cost) -> (or false cost) (or false array))) ;; (define-check+error-type vector) (define-check+error-type procedure) ;; ;moremacros (define-syntax swap! (syntax-rules () ((swap! ?a ?b) (let ((_tmp ?a)) (set! ?a ?b) (set! ?b _tmp)) ) ) ) ;fx-inlines (define-inline (fxzero? x) (fx= 0 x)) (define-inline (fxadd1 x) (fx+ x 1)) (define-inline (fxsub1 x) (fx- x 1)) ;; #; ;cleaner but vector-fold not inlined (define (vector-minimum-cost vec) (define (minelmf i pr nval) (unless (cost-less-than (car pr) nval) (set-car! pr nval) (set-cdr! pr i) ) pr ) (if (fxzero? (vector-length vec)) (values #f #f) (let ((pr (vector-fold minelmf `(,cost-positive-infinity . #f) vec))) (values (car pr) (cdr pr)) ) ) ) (define (vector-minimum-cost vec) (let ((len (vector-length vec))) (if (fxzero? len) (values #f #f) (let ((idx 0) (val (vector-ref vec 0)) ) (do ((i (the fixnum 1) (fxadd1 i))) ((fx= i len) (values val idx)) (let ((nval (vector-ref vec i))) (unless (cost-less-than val nval) (set! val nval) (set! idx i) ) ) ) ) ) ) ) ;; (define (levenshtein/vector sv tv n m cv av lv elm-eql perf finf limit-cost) (let ((wrkvec (make-vector (fxadd1 m))) (inscst (vector-ref cv 0)) (delcst (vector-ref cv 1)) ) (do ((k (the fixnum 0) (fxadd1 k)) (cst (fixnum->cost 0) (cost-add cst inscst))) ((fx> k m)) (vector-set! wrkvec k cst) ) ; "Early" return is needed (let/cc return (let ((next (the (or false fixnum) #f))) (do ((i (the fixnum 0) (fxadd1 i))) ((fx= i n) (finf next)) (let ((s@i (vector-ref sv i))) (do ((j (the fixnum 0) (fxadd1 j)) (cur (cost-multiply delcst (fixnum->cost (fxadd1 i))) next) ) ((fx= j m) (vector-set! wrkvec m next)) (let ((apply-oper (lambda (eo-i cost) (let ((a (vector-ref av eo-i)) (l (vector-ref lv eo-i))) (if (fxzero? a) ; A L (if (fxzero? l) ; - - (cost-add cost (vector-ref wrkvec j)) ; 0 0 (cost-add cost (vector-ref wrkvec (fxadd1 j)))) ; 0 1 (if (fxzero? l) ; (cost-add cost cur) ; 1 0 (let ((cost@ (vector-ref wrkvec j))) ; (if (elm-eql s@i (vector-ref tv j)) ; 1 1 cost@ (cost-add cost cost@))))))))) (let-values (((cost index) (vector-minimum-cost (vector-map apply-oper cv)))) ; Quit when past limit (when (and limit-cost (cost-less-than limit-cost cost)) (call-with-values (cut finf #f) return)) (perf i j index cost) (set! next cost) (vector-set! wrkvec j cur) ) ) ) ) ) ) ) ) ) ;; (define (levenshtein/matrix sv tv n m cv av lv elm-eql perf finf limit-cost) (let ((mat (make-array '#() (fxadd1 n) (fxadd1 m))) (inscst (vector-ref cv 0)) (delcst (vector-ref cv 1))) (do ((j (the fixnum 0) (fxadd1 j)) (cst cost-zero (cost-add cst inscst))) ((fx> j m)) (array-set! mat cst 0 j) ) ; "Early" return is needed (let/cc return (do ((i (the fixnum 1) (fxadd1 i))) ((fx> i n) (finf (array-ref mat n m))) (array-set! mat (cost-multiply (fixnum->cost i) delcst) i 0) (let* ((i-1 (fxsub1 i)) (s@i (vector-ref sv i-1)) ) (do ((j (the fixnum 1) (fxadd1 j))) ((fx> j m)) (let* ((j-1 (fxsub1 j)) (t-j (vector-ref tv j-1)) ) (let ((apply-oper (lambda (eo-i opercost) (let ((a (vector-ref av eo-i)) (l (vector-ref lv eo-i)) ) ; Must be within bounds of matrix (if (and (fx>= i a) (fx>= j l)) (let ((currcost (array-ref mat (fx- i a) (fx- j l)))) ; Allow a no-op cost <> 0 (cond ((and (fxzero? a) (fxzero? l)) (cost-add currcost opercost)) ; Special case w/ test, simplified ((and (fx= a 1) (fx= l 1)) (if (elm-eql s@i t-j) currcost (cost-add currcost opercost))) ; General case w/ test ((or (fx> a 1) (fx> l 1)) (let ((x (cost-add currcost opercost))) (unless (elm-eql (vector-ref sv (fx- i a)) t-j) (set! x (cost-add x opercost))) (unless (elm-eql s@i (vector-ref tv (fx- j l))) (set! x (cost-add x opercost))) x)) ; Otherwise a = 0|1 & l = 1|0 (else (cost-add currcost opercost) ) ) ) ; Does this make sense when operation would violate mat bounds? cost-positive-infinity))))) (let-values (((cost index) (vector-minimum-cost (vector-map apply-oper cv)))) ; Quit when past limit (when (and limit-cost (cost-less-than limit-cost cost)) (call-with-values (cut finf #f) return)) ; Performed operation matrix is 0-based (perf i-1 j-1 index cost) (array-set! mat cost i j) ) ) ) ) ) ) ) ) ) ;;; (define (levenshtein-distance/vector* srcvec trgvec #!rest operlist #!key operations (elm-eql char=?) (limit-cost (the (or false cost) #f))) (check-vector 'levenshtein-distance/vector* srcvec "source") (check-vector 'levenshtein-distance/vector* trgvec "target") (check-procedure 'levenshtein-distance/vector* elm-eql "elm-eql") ; Note that the edit-distance procedures ; return via the (finf) procedure. (let* ((operlist (filter levenshtein-operator? operlist)) ;only opers (opervec (levenshtein-extended-operators operlist)) ;Insert must be 1st, delete 2nd (insopr (vector-ref opervec 0)) ) ; Setup for no operation introspection (let ((srclen (vector-length srcvec)) (trglen (vector-length trgvec)) (perf void) ; Perform operation accumulate ;FIXME synth path-matrix w/ ins oper (finf ; Finish, "identity" (lambda (cost) (values cost #f))) (zrtf ; Zero-length, assume 'finf' binding to "identity" (lambda (len) (values (cost-multiply (fixnum->cost len) (levenshtein-operator-cost insopr)) #f))) ) ; Use shorter as the target (when (fx< srclen trglen) (swap! srclen trglen) (swap! srcvec trgvec)) ; Setup callback & return procedures for performed operations matrix ; should caller want operation introspection (when operations (if (and (fxzero? srclen) (fxzero? trglen)) ;then degenerate case (set! zrtf (lambda (x) (values 0 (make-array '#() 0 0)))) ;else source or target non-empty (let* ((pm-rows (max 1 srclen)) (pm-cols (max 1 trglen)) (pm (make-array '#() pm-rows pm-cols)) ) (set! perf (lambda (i j ovi cost) (array-set! pm (cons cost (vector-ref opervec ovi)) i j))) (set! finf (lambda (cost) (values cost pm))) (set! zrtf (lambda (len) (let ((io-c (levenshtein-operator-cost insopr))) (do ((i (the fixnum 0) (fxadd1 i)) (i-cost io-c (cost-add i-cost io-c))) ((fx= i pm-rows)) (do ((j (the fixnum 0) (fxadd1 j)) (j-cost i-cost (cost-add j-cost io-c))) ((fx= j pm-cols)) (perf i j 0 j-cost))) (finf (cost-multiply (fixnum->cost len) io-c))))))) ) ; Unpack edit operation offsets (let ((cstvec (vector-map (lambda (i eo) (levenshtein-operator-cost eo)) opervec)) (abvvec (vector-map (lambda (i eo) (levenshtein-operator-above eo)) opervec)) (lftvec (vector-map (lambda (i eo) (levenshtein-operator-left eo)) opervec))) ; Handle empty source/target special case, then choose algorithm based ; on complexity of edit operations (cond ((fxzero? srclen) (zrtf trglen)) ((fxzero? trglen) (zrtf srclen)) ((or (null? operlist) (every levenshtein-base-operator? operlist)) (levenshtein/vector srcvec trgvec srclen trglen cstvec abvvec lftvec elm-eql perf finf limit-cost)) (else (levenshtein/matrix srcvec trgvec srclen trglen cstvec abvvec lftvec elm-eql perf finf limit-cost))) ) ) ) ) ) ;functor levenshtein-vector-functor