;;;; levenshtein-sequence-functor.scm -*- Scheme -*- ;;;; Kon Lovett, Mar '20 ;;;; Kon Lovett, Apr '12 ;;;; Kon Lovett, May '06 ;; Issues ;; ;; - When insert-cost <> delete-cost is the initialization of the work-vector correct? ;; What about when we swap the source & target? ;; ;; - Don't have a "real" sequence abstraction, only recognizes vector, list, and string. ;; What about byte-vector, u8vector, ..., and stream? (include "levenshtein-cost-interface") (include "levenshtein-sequence-interface") (functor (levenshtein-sequence-functor (CO COST-OPER) (SO SEQUENCE-OPER)) (;export levenshtein-distance/sequence) (import scheme (chicken base) (chicken type) (srfi 1) (srfi 63) vector-lib miscmacros type-checks type-errors levenshtein-operators CO SO) ;;; ;moremacros (define-syntax swap! (syntax-rules () ((swap! ?a ?b) (let ((_tmp ?a)) (set! ?a ?b) (set! ?b _tmp)) ) ) ) (define (levenshtein-distance/sequence source target #!key (insert-cost 1) (delete-cost 1) (substitute-cost 1) (get-work-vector make-vector) (elm-eql eqv?) (limit-cost #f)) ; Validate (check-sequence 'levenshtein-distance/generic-sequence source) (check-sequence 'levenshtein-distance/generic-sequence target) (check-procedure 'levenshtein-distance/generic-sequence elm-eql "elm-eql") (check-procedure 'levenshtein-distance/generic-sequence get-work-vector "get-work-vector") ; (let ((source-length (the fixnum (sequence-length source))) (target-length (the fixnum (sequence-length target))) ) (cond ; Quit when source or target empty ((zero? source-length) (cost-multiply (fixnum->cost target-length) insert-cost)) ((zero? target-length) (cost-multiply (fixnum->cost source-length) insert-cost)) ; Otherwise need to calculate distance (else ; "Strip" common prefix & suffix (let ((prefix-length (the fixnum (sequence-prefix-length elm-eql source target))) (suffix-length (the fixnum (sequence-suffix-length elm-eql source target))) ) (let ((stripped-source-start (the fixnum prefix-length)) (stripped-source-end (the fixnum (- source-length suffix-length))) (stripped-target-start (the fixnum prefix-length)) (stripped-target-end (the fixnum (- target-length suffix-length))) ) (let ((stripped-source-length (the fixnum (- stripped-source-end stripped-source-start))) (stripped-target-length (the fixnum (- stripped-target-end stripped-target-start)))) ; Prefix overlaps suffix? (unless (and (<= 0 stripped-source-length) (<= 0 stripped-target-length)) ; Use the longest match & revert to the full string otherwise (if (< prefix-length suffix-length) (begin (set! stripped-source-start 0) (set! stripped-target-start 0)) (begin (set! stripped-source-end source-length) (set! stripped-target-end target-length))) ; Re-calc stripped lengths (set! stripped-source-length (- stripped-source-end stripped-source-start)) (set! stripped-target-length (- stripped-target-end stripped-target-start))) (cond ; Stripped source or target empty? ((zero? stripped-source-length) (cost-multiply (fixnum->cost stripped-target-length) insert-cost)) ((zero? stripped-target-length) (cost-multiply (fixnum->cost stripped-source-length) insert-cost)) ; Otherwise need to calculate distance (else ; Perform distance calculation on "stripped" source & target (let ((source (subsequence/shared source stripped-source-start stripped-source-end)) (target (subsequence/shared target stripped-target-start stripped-target-end)) (source-length stripped-source-length) (target-length stripped-target-length)) ; Swap so target is the shorter of source & target (when (< source-length target-length) (swap! source-length target-length) (swap! source target)) ; Allocate matrix row/column work vector (let ((work (get-work-vector (add1 target-length)))) ; Initialize work vector (do ((k (the fixnum 0) (add1 k)) (cost cost-zero (cost-add cost insert-cost)) ) ((> k target-length)) (vector-set! work k cost)) ; "Early" return is needed (let/cc return ; Calculate edit "cost" (let ((total-cost (the * #f)) (cost-at-source delete-cost)) ; For each source element (sequence-for-each (lambda (source-index source-elm) ; Every element costs (let ((current-cost cost-at-source)) ; For each target element (sequence-for-each (lambda (target-index target-elm) ; Calculate cost to this position (set! total-cost (cost-minimum (cost-minimum (cost-add insert-cost (vector-ref work (add1 target-index))) (cost-add delete-cost current-cost)) (let ((cost-at-target (vector-ref work target-index))) (if (elm-eql source-elm target-elm) cost-at-target (cost-add substitute-cost cost-at-target))))) ; Quit when past limit (when (and limit-cost (cost-less-than limit-cost total-cost)) (return limit-cost)) ; Save the cost to this point (vector-set! work target-index current-cost) (set! current-cost total-cost) ) target) ; Save total-cost at target (vector-set! work target-length total-cost) ) ; Bump to next source cost ; Assumes indexing from 0 to end (set! cost-at-source (cost-add cost-at-source delete-cost)) ) source) ; Result is the total cost of edit total-cost ) ) ) ) ) ) ) ) ) ) ) ) ) ) ;functor levenshtein-sequence-functor