;;;; levenshtein-sequence-functor.scm -*- Scheme -*- ;;;; Kon Lovett, Mar '20 ;;;; Kon Lovett, Apr '12 ;;;; Kon Lovett, May '06 ;; Issues ;; ;; - When insert-cost <> delete-cost is the initialization of the work-vector correct? ;; What about when we swap the source & target? ;; ;; - Don't have a "real" sequence abstraction, only recognizes vector, list, and string. ;; What about byte-vector, u8vector, ..., and stream? (include "levenshtein-cost-interface") (include "levenshtein-sequence-interface") (functor (levenshtein-sequence-functor (CO COST-OPER) (SO SEQUENCE-OPER)) (;export levenshtein-distance/sequence) (import scheme (chicken base) (chicken type) (srfi 1) (srfi 63) vector-lib miscmacros (only (check-errors basic) define-check+error-type) levenshtein-operators CO SO) ;NOTE functors so ... (define-type cost *) (define-type sequence *) (: levenshtein-distance/sequence (sequence sequence #!key #:insert-cost cost #:delete-cost cost #:substitute-cost cost #:get-work-vector (fixnum -> vector) #:elm-eql (* * -> boolean) #:limit-cost cost -> (or false cost))) ;; (define-check+error-type procedure) ;; ;moremacros (define-syntax swap! (syntax-rules () ((swap! ?a ?b) (let ((_tmp ?a)) (set! ?a ?b) (set! ?b _tmp)) ) ) ) ;; (define (levenshtein-distance/sequence src trg #!key (insert-cost 1) (delete-cost 1) (substitute-cost 1) (get-work-vector make-vector) (elm-eql eqv?) (limit-cost cost-positive-infinity)) ; Validate (check-sequence 'levenshtein-distance/generic-sequence src) (check-sequence 'levenshtein-distance/generic-sequence trg) (check-procedure 'levenshtein-distance/generic-sequence elm-eql "elm-eql") (check-procedure 'levenshtein-distance/generic-sequence get-work-vector "get-work-vector") ; (let ((src-len (the fixnum (sequence-length src))) (trg-len (the fixnum (sequence-length trg))) ) ; Quit when source or target empty (cond ((zero? src-len) (cost-multiply (fixnum->cost trg-len) insert-cost)) ((zero? trg-len) (cost-multiply (fixnum->cost src-len) insert-cost)) ; Otherwise need to calculate distance (else ; "Strip" common prefix & suffix (let ((pre-len (the fixnum (sequence-prefix-length elm-eql src trg))) (suf-len (the fixnum (sequence-suffix-length elm-eql src trg))) ) (let ((com-src-st (the fixnum pre-len)) (com-src-ed (the fixnum (- src-len suf-len))) (com-trg-st (the fixnum pre-len)) (com-trg-ed (the fixnum (- trg-len suf-len))) ) (let ((com-src-len (the fixnum (- com-src-ed com-src-st))) (com-trg-len (the fixnum (- com-trg-ed com-trg-st))) ) ; Prefix overlaps suffix? (unless (and (<= 0 com-src-len) (<= 0 com-trg-len)) ; Use the longest match & revert to the full string otherwise (cond ((< pre-len suf-len) (set! com-src-st 0) (set! com-trg-st 0) ) (else (set! com-src-ed src-len) (set! com-trg-ed trg-len) ) ) ; Re-calc common lengths (set! com-src-len (- com-src-ed com-src-st)) (set! com-trg-len (- com-trg-ed com-trg-st))) ; Stripped source or target empty? (cond ((zero? com-src-len) (cost-multiply (fixnum->cost com-trg-len) insert-cost)) ((zero? com-trg-len) (cost-multiply (fixnum->cost com-src-len) insert-cost)) ; Otherwise need to calculate distance (else ; Perform distance calculation on "common" source & target (let ((src (subsequence/shared src com-src-st com-src-ed)) (trg (subsequence/shared trg com-trg-st com-trg-ed)) (src-len com-src-len) (trg-len com-trg-len)) ; Swap so target is the shorter of source & target (when (< src-len trg-len) (swap! src-len trg-len) (swap! src trg)) ; Allocate matrix row/column work vector (let ((work (get-work-vector (add1 trg-len)))) ; Initialize work vector (do ((k (the fixnum 0) (add1 k)) (cost cost-zero (cost-add cost insert-cost)) ) ((> k trg-len)) (vector-set! work k cost)) ; "Early" return is needed (let/cc return ; Calculate edit "cost" (let ((total-cost (the (or false cost) #f)) (cost-at-src delete-cost)) ; For each source element (sequence-for-each (lambda (src-idx src-elm) ; Every element costs (let ((current-cost cost-at-src)) ; For each target element (sequence-for-each (lambda (trg-idx trg-elm) ; Calculate cost to this position (let ((cost (cost-minimum (cost-minimum (cost-add insert-cost (vector-ref work (add1 trg-idx))) current-cost #; ;already delete-cost (cost-add delete-cost current-cost)) (let ((cost-at-trg (vector-ref work trg-idx))) (if (elm-eql src-elm trg-elm) cost-at-trg (cost-add substitute-cost cost-at-trg))))) ) (set! total-cost cost) ) ; Quit when past limit (when (cost-less-than limit-cost total-cost) (return #f)) ; Save the cost to this point (vector-set! work trg-idx current-cost) (set! current-cost total-cost) ) trg) ; Save total-cost at target (vector-set! work trg-len total-cost) ) ; Bump to next source cost ; Assumes indexing from 0 to end (set! cost-at-src (cost-add cost-at-src delete-cost)) ) src) ; Result is the total cost of edit total-cost ) ) ) ) ) ) ) ) ) ) ) ) ) ) ;functor levenshtein-sequence-functor