;;; -*- Hen -*- ;;;; levenshtein-sequence-functor.scm ;;;; Kon Lovett, May '06 ;;;; Kon Lovett, Apr 2012 ;; Issues ;; ;; - When insert-cost <> delete-cost is the initialization of the work-vector correct? ;; What about when we swap the source & target? ;; ;; - Don't have a "real" sequence abstraction, only recognizes vector, list, and string. ;; What about byte-vector, u8vector, ..., and stream? (include "levenshtein-cost-interface") (include "levenshtein-sequence-interface") (functor (levenshtein-sequence-functor (CO COST-OPER) (SO SEQUENCE-OPER)) (;export levenshtein-distance/sequence) (import scheme chicken CO SO) (use data-structures levenshtein-operators srfi-1 vector-lib srfi-63 miscmacros moremacros numeric-macros type-checks type-errors) ;;; (define (levenshtein-distance/sequence source target #!key (insert-cost 1) (delete-cost 1) (substitute-cost 1) (get-work-vector make-vector) (elm-eql eqv?) (limit-cost #f)) ; Validate (check-procedure 'levenshtein-distance/generic-sequence elm-eql "elm-eql") (check-procedure 'levenshtein-distance/generic-sequence get-work-vector "get-work-vector") ; (let ((source-length (sequence-length source)) (target-length (sequence-length target))) (cond ; Quit when source or target empty ((fx= 0 source-length) (cost-multiply target-length insert-cost)) ((fx= 0 target-length) (cost-multiply source-length insert-cost)) ; Otherwise need to calculate distance (else ; "Strip" common prefix & suffix (let ((prefix-length (sequence-prefix-length elm-eql source target)) (suffix-length (sequence-suffix-length elm-eql source target))) (let ((stripped-source-start prefix-length) (stripped-source-end (fx- source-length suffix-length)) (stripped-target-start prefix-length) (stripped-target-end (fx- target-length suffix-length))) (let ((stripped-source-length (fx- stripped-source-end stripped-source-start)) (stripped-target-length (fx- stripped-target-end stripped-target-start))) ; Prefix overlaps suffix? (unless (and (fx<= 0 stripped-source-length) (fx<= 0 stripped-target-length)) ; Use the longest match & revert to the full string otherwise (if (fx< prefix-length suffix-length) (begin (set! stripped-source-start 0) (set! stripped-target-start 0)) (begin (set! stripped-source-end source-length) (set! stripped-target-end target-length))) ; Re-calc stripped lengths (set! stripped-source-length (fx- stripped-source-end stripped-source-start)) (set! stripped-target-length (fx- stripped-target-end stripped-target-start))) (cond ; Stripped source or target empty? ((fx= 0 stripped-source-length) (cost-multiply stripped-target-length insert-cost)) ((fx= 0 stripped-target-length) (cost-multiply stripped-source-length insert-cost)) ; Otherwise need to calculate distance (else ; Perform distance calculation on "stripped" source & target (let ((source (subsequence/shared source stripped-source-start stripped-source-end)) (target (subsequence/shared target stripped-target-start stripped-target-end)) (source-length stripped-source-length) (target-length stripped-target-length)) ; Swap so target is the shorter of source & target (when (fx< source-length target-length) (swap-set! source-length target-length) (swap-set! source target)) ; Allocate matrix row/column work vector (let ((work (get-work-vector (fx++ target-length)))) ; Initialize work vector (do ((k 0 (fx++ k)) (cost 0 (cost-add cost insert-cost))) ((fx> k target-length)) (vector-set! work k cost)) ; "Early" return is needed (let/cc return ; Calculate edit "cost" (let ((total-cost #f) (cost-at-source delete-cost)) ; For each source element (sequence-for-each (lambda (source-index source-elm) ; Every element costs (let ((current-cost cost-at-source)) ; For each target element (sequence-for-each (lambda (target-index target-elm) ; Calculate cost to this position (set! total-cost (cost-minimum (cost-add insert-cost (vector-ref work (fx++ target-index))) (cost-add delete-cost current-cost) (let ((cost-at-target (vector-ref work target-index))) (if (elm-eql source-elm target-elm) cost-at-target (cost-add substitute-cost cost-at-target))))) ; Quit when past limit (when (and limit-cost (cost-less-than limit-cost total-cost)) (return limit-cost)) ; Save the cost to this point (vector-set! work target-index current-cost) (set! current-cost total-cost) ) target) ; Save total-cost at target (vector-set! work target-length total-cost) ) ; Bump to next source cost ; Assumes indexing from 0 to end (set! cost-at-source (cost-add cost-at-source delete-cost)) ) source) ; Result is the total cost of edit total-cost ) ) ) ) ) ) ) ) ) ) ) ) ) ) ;functor levenshtein-sequence-functor