;;;; levenshtein-sequence-functor.scm -*- Scheme -*- ;;;; Kon Lovett, Mar '20 ;;;; Kon Lovett, Apr '12 ;;;; Kon Lovett, May '06 ;; Issues ;; ;; - When insert-cost <> delete-cost is the initialization of the work-vector correct? ;; What about when we swap the source & target? ;; ;; - Don't have a "real" sequence abstraction, only recognizes vector, list, and string. ;; What about byte-vector, u8vector, ..., and stream? (include-relative "levenshtein-cost-interface") (include-relative "levenshtein-sequence-interface") (functor (levenshtein-sequence-functor (CO COST-OPER) (SO SEQUENCE-OPER)) (;export levenshtein-distance/sequence) (import scheme (chicken base) (chicken type) (chicken fixnum) (srfi 1) (srfi 63) vector-lib miscmacros (only (check-errors basic) define-check+error-type) levenshtein-operators CO SO) ;NOTE functors so ... (define-type cost *) (define-type sequence *) (: levenshtein-distance/sequence (sequence sequence #!key #:insert-cost cost #:delete-cost cost #:substitute-cost cost #:get-work-vector (fixnum -> vector) #:elm-eql (* * -> boolean) #:limit-cost (or false cost) -> (or false cost))) ;; (define-check+error-type procedure) ;; ;moremacros (define-syntax swap! (syntax-rules () ((swap! ?a ?b) (let ((_tmp ?a)) (set! ?a ?b) (set! ?b _tmp)) ) ) ) ;fx-inlines (define-inline (fxzero? x) (fx= 0 x)) (define-inline (fxadd1 x) (fx+ x 1)) ;; (define (levenshtein-distance/sequence src trg #!key (insert-cost 1) (delete-cost 1) (substitute-cost 1) (get-work-vector make-vector) (elm-eql eqv?) (limit-cost (the (or false cost) #f))) ; Validate (check-sequence 'levenshtein-distance/generic-sequence src) (check-sequence 'levenshtein-distance/generic-sequence trg) (check-procedure 'levenshtein-distance/generic-sequence elm-eql "elm-eql") (check-procedure 'levenshtein-distance/generic-sequence get-work-vector "get-work-vector") ; (let ((src-len (the fixnum (sequence-length src))) (trg-len (the fixnum (sequence-length trg))) ) ; Quit when source or target empty (cond ((fxzero? src-len) (cost-multiply (fixnum->cost trg-len) insert-cost)) ((fxzero? trg-len) (cost-multiply (fixnum->cost src-len) insert-cost)) ; Otherwise need to calculate distance (else ; "Strip" common prefix & suffix (let ((pre-len (the fixnum (sequence-prefix-length elm-eql src trg))) (suf-len (the fixnum (sequence-suffix-length elm-eql src trg))) ) (let ((com-src-st (the fixnum pre-len)) (com-src-ed (the fixnum (fx- src-len suf-len))) (com-trg-st (the fixnum pre-len)) (com-trg-ed (the fixnum (fx- trg-len suf-len))) ) (let ((com-src-len (the fixnum (fx- com-src-ed com-src-st))) (com-trg-len (the fixnum (fx- com-trg-ed com-trg-st))) ) ; Prefix overlaps suffix? (unless (and (fx<= 0 com-src-len) (fx<= 0 com-trg-len)) ; Use the longest match & revert to the full string otherwise (cond ((fx< pre-len suf-len) (set! com-src-st 0) (set! com-trg-st 0) ) (else (set! com-src-ed src-len) (set! com-trg-ed trg-len) ) ) ; Re-calc common lengths (set! com-src-len (fx- com-src-ed com-src-st)) (set! com-trg-len (fx- com-trg-ed com-trg-st)) ) ; Stripped source or target empty? (cond ((fxzero? com-src-len) (cost-multiply (fixnum->cost com-trg-len) insert-cost)) ((fxzero? com-trg-len) (cost-multiply (fixnum->cost com-src-len) insert-cost)) ; Otherwise need to calculate distance (else ; Perform distance calculation on "common" source & target (let ((src (subsequence/shared src com-src-st com-src-ed)) (trg (subsequence/shared trg com-trg-st com-trg-ed)) (src-len com-src-len) (trg-len com-trg-len)) ; Swap so target is the shorter of source & target (when (fx< src-len trg-len) (swap! src-len trg-len) (swap! src trg)) ; Allocate matrix row/column work vector (let ((work (get-work-vector (fxadd1 trg-len)))) ; Initialize work vector (do ((k (the fixnum 0) (fxadd1 k)) (cost cost-zero (cost-add cost insert-cost)) ) ((fx> k trg-len)) (vector-set! work k cost)) ; "Early" return is needed (let/cc return ; Calculate edit "cost" (let ((total-cost (the (or false cost) #f)) (cost-at-src delete-cost)) ; For each source element (sequence-for-each (lambda (src-idx src-elm) ; Every element costs (let ((current-cost cost-at-src)) ; For each target element (sequence-for-each (lambda (trg-idx trg-elm) ; Calculate cost to this position (set! total-cost (cost-minimum (cost-minimum (cost-add insert-cost (vector-ref work (fxadd1 trg-idx))) (cost-add delete-cost current-cost)) (let ((cost-at-trg (vector-ref work trg-idx))) (if (elm-eql src-elm trg-elm) cost-at-trg (cost-add substitute-cost cost-at-trg))))) ; Quit when past limit (when (and limit-cost (cost-less-than limit-cost total-cost)) (return #f)) ; Save the cost to this point (vector-set! work trg-idx current-cost) (set! current-cost total-cost) ) trg) ; Save total-cost at target (vector-set! work trg-len total-cost) ) ; Bump to next source cost ; Assumes indexing from 0 to end (set! cost-at-src (cost-add cost-at-src delete-cost)) ) src) ; Result is the total cost of edit total-cost ) ) ) ) ) ) ) ) ) ) ) ) ) ) ;functor levenshtein-sequence-functor