;;;; levenshtein-vector.scm -*- Scheme -*- ;;;; Kon Lovett, Jul '23 ;;;; Kon Lovett, Mar '20 ;;;; Kon Lovett, Apr '12 ;;;; Kon Lovett, Sep '05 #| Discussion - Inputs - X UTF8 character encoding \ String, Vector, List Edit Operation Specification Linear Gap Cost Specification - Run of insertions (or deletions) of length x, has a cost of ax+b, for constants a and b. If b>0, this penalises numerous short runs of insertions and deletions. X Cost limiting Common prefix/suffix stripping Outputs - X Total Cost X Cost of Each Performed Operation X Each Performed Operation Linear Gap Cost |# (module levenshtein-vector (;export levenshtein-distance/vector*) (import scheme (chicken base) (chicken type) (chicken fixnum) (srfi 1) (srfi 63) vector-lib miscmacros (only (check-errors basic) define-check+error-type) levenshtein-operators) (define-type array (struct array)) (define-type levenshtein-operator (struct levenshtein-operator)) (define-type cost (not boolean)) (: levenshtein-distance/vector* (vector vector ;#!rest (list-of levenshtein-operator) #!key #:operations (list-of levenshtein-operator) #:eqlp (* * -> boolean) #:mult (cost cost -> cost) #:plus (cost cost -> cost) #:ltp (cost cost -> boolean) #:limit-cost (or false cost) -> (or false cost) (or false array))) ;; (define-check+error-type vector) (define-check+error-type procedure) ;; ;moremacros (define-syntax swap! (syntax-rules () ((swap! ?a ?b) (let ((_tmp ?a)) (set! ?a ?b) (set! ?b _tmp)) ) ) ) ;; (define (vector-relop op vec) (let ((len (vector-length vec))) (if (zero? len) (values #f #f) (let ((idx 0) (val (vector-ref vec 0)) ) (do ((i (the fixnum 1) (add1 i))) ((= i len) (values val idx)) (let ((nval (vector-ref vec i))) (unless (op val nval) (set! idx i) (set! val nval) ) ) ) ) ) ) ) ;; (define cost-positive-infinity most-positive-fixnum) (define (levenshtein/vector sv tv n m cv av lv eqlp perf finf plus mult ltp limit-cost) (let ((wrkvec (make-vector (add1 m))) (inscst (vector-ref cv 0)) (delcst (vector-ref cv 1)) ) (do ((k (the fixnum 0) (add1 k)) (cst (the fixnum 0) (plus cst inscst))) ((> k m)) (vector-set! wrkvec k cst) ) ; "Early" return is needed (let/cc return (let ((next #f)) (do ((i (the fixnum 0) (add1 i))) ((= i n) (finf next)) (let ((s-i (vector-ref sv i))) (do ((j (the fixnum 0) (add1 j)) (cur (mult delcst (add1 i)) next)) ((= j m) (vector-set! wrkvec m next)) (let ((apply-oper (lambda (eo-i cost) (let ((a (vector-ref av eo-i)) (l (vector-ref lv eo-i))) (if (zero? a) ; A L (if (zero? l) ; - - (plus cost (vector-ref wrkvec j)) ; 0 0 (plus cost (vector-ref wrkvec (add1 j)))) ; 0 1 (if (zero? l) ; (plus cost cur) ; 1 0 (let ((cost@ (vector-ref wrkvec j))) ; (if (eqlp s-i (vector-ref tv j)) ; 1 1 cost@ (plus cost cost@))))))))) (let-values (((cost index) (vector-relop ltp (vector-map apply-oper cv)))) ; Quit when past limit (when (and limit-cost (ltp limit-cost cost)) (call-with-values (cut finf #f) return)) (perf i j index cost) (set! next cost) (vector-set! wrkvec j cur) ) ) ) ) ) ) ) ) ) ;; (define (levenshtein/matrix sv tv n m cv av lv eqlp perf finf plus mult ltp limit-cost) (let ((mat (make-array '#() (add1 n) (add1 m))) (inscst (vector-ref cv 0)) (delcst (vector-ref cv 1))) (do ((j (the fixnum 0) (add1 j)) (cst (the fixnum 0) (plus cst inscst))) ((> j m)) (array-set! mat cst 0 j) ) ; "Early" return is needed (let/cc return (do ((i (the fixnum 1) (add1 i))) ((> i n) (finf (array-ref mat n m))) (array-set! mat (mult i delcst) i 0) (let* ((i-1 (sub1 i)) (s-i (vector-ref sv i-1)) ) (do ((j (the fixnum 1) (add1 j))) ((> j m)) (let* ((j-1 (sub1 j)) (t-j (vector-ref tv j-1)) ) (let ((apply-oper (lambda (eo-i opercost) (let ((a (vector-ref av eo-i)) (l (vector-ref lv eo-i)) ) ; Must be within bounds of matrix (if (and (>= i a) (>= j l)) (let ((currcost (array-ref mat (- i a) (- j l)))) ; Allow a no-op cost <> 0 (cond ((and (zero? a) (zero? l)) (plus currcost opercost)) ; Special case w/ test, simplified ((and (= a (the fixnum 1)) (= l (the fixnum 1))) (if (eqlp s-i t-j) currcost (plus currcost opercost))) ; General case w/ test ((or (> a (the fixnum 1)) (> l (the fixnum 1))) (let ((x (plus currcost opercost))) (unless (eqlp (vector-ref sv (- i a)) t-j) (set! x (plus x opercost))) (unless (eqlp s-i (vector-ref tv (- j l))) (set! x (plus x opercost))) x)) ; Otherwise a = 0|1 & l = 1|0 (else (plus currcost opercost) ) ) ) ; Does this make sense when operation would violate mat bounds? ; Also out of fixnum range. +inf.0))))) (let-values (((cost index) (vector-relop ltp (vector-map apply-oper cv)))) ; Quit when past limit (when (and limit-cost (ltp limit-cost cost)) (call-with-values (cut finf #f) return)) ; Performed operation matrix is 0-based (perf i-1 j-1 index cost) (array-set! mat cost i j) ) ) ) ) ) ) ) ) ) ;;; (define (levenshtein-distance/vector* srcvec trgvec #!rest operlist #!key operations (eqlp char=?) (mult fx*) (plus fx+) (ltp fx<) (limit-cost (the (or false cost) #f))) (check-vector 'levenshtein-distance/vector* srcvec "source") (check-vector 'levenshtein-distance/vector* trgvec "target") (check-procedure 'levenshtein-distance/vector* eqlp "eqlp") (check-procedure 'levenshtein-distance/vector* mult "mult") (check-procedure 'levenshtein-distance/vector* plus "plus") (check-procedure 'levenshtein-distance/vector* ltp "ltp") ; Note that the edit-distance procedures ; return via the (finf) procedure. (let* ((operlist (filter levenshtein-operator? operlist)) ;only opers (opervec (levenshtein-extended-operators operlist)) ;Insert must be 1st, delete 2nd (insopr (vector-ref opervec 0)) ) ; Setup for no operation introspection (let ((srclen (vector-length srcvec)) (trglen (vector-length trgvec)) (perf void) ; Perform operation accumulate (finf ; Finish, "identity" (lambda (cost) (values cost #f))) (zrtf ; Zero-length, assume 'finf' binding to "identity" (lambda (len) (values (mult len (levenshtein-operator-cost insopr)) #f))) ) ; Use shorter as the target (when (< srclen trglen) (swap! srclen trglen) (swap! srcvec trgvec)) ; Setup callback & return procedures for performed operations matrix ; should caller want operation introspection (when operations (if (and (zero? srclen) (zero? trglen)) ;then degenerate case (set! zrtf (lambda (x) (values 0 (make-array '#() 0 0)))) ;else source or target non-empty (let ((pm #f) (pm-rows (max 1 srclen)) (pm-cols (max 1 trglen))) (set! pm (make-array '#() pm-rows pm-cols)) (set! perf (lambda (i j ovi cost) (array-set! pm (cons cost (vector-ref opervec ovi)) i j))) (set! finf (lambda (cost) (values cost pm))) (set! zrtf (lambda (len) (let ((io-c (levenshtein-operator-cost insopr))) (do ((i (the fixnum 0) (add1 i)) (i-cost io-c (plus i-cost io-c))) ((= i pm-rows)) (do ((j (the fixnum 0) (add1 j)) (j-cost i-cost (plus j-cost io-c))) ((= j pm-cols)) (perf i j 0 j-cost))) (finf (mult io-c len)))))))) ; Unpack edit operation offsets (let ((cstvec (vector-map (lambda (i eo) (levenshtein-operator-cost eo)) opervec)) (abvvec (vector-map (lambda (i eo) (levenshtein-operator-above eo)) opervec)) (lftvec (vector-map (lambda (i eo) (levenshtein-operator-left eo)) opervec)) ) ; Handle empty source/target special case, then choose algorithm based ; on complexity of edit operations (cond ((zero? srclen) (zrtf trglen)) ((zero? trglen) (zrtf srclen)) ((or (null? operlist) (every levenshtein-base-operator? operlist)) (levenshtein/vector srcvec trgvec srclen trglen cstvec abvvec lftvec eqlp perf finf plus mult ltp limit-cost)) (else (levenshtein/matrix srcvec trgvec srclen trglen cstvec abvvec lftvec eqlp perf finf plus mult ltp limit-cost))) ) ) ) ) ) ;module levenshtein-vector