;;;; bloom-filter.scm ;;;; Kon Lovett, Jun '06 ;; Issues ;; ;; - Uses 'meessage-digest-object' to create a digest. Some overhead if actual ;; object type is known. ;; ;; - Cannot change digest generator after filter creation. ;; ;; - Not optimal for 64-bit machines. ;; ;; - Note the use of "in-place" list mutators, '(*! ...). Be vewy careful. ;; ;; - Although K may be lower the actual number of hash functions ;; are generated anyway. It is up to the caller to optimize. ;; ;; - Use of 'iset' is slower. (module bloom-filter (;export optimum-k optimum-m p-random-one-bit p-false-positive desired-m actual-k make-bloom-filter bloom-filter? check-bloom-filter error-bloom-filter bloom-filter-algorithms bloom-filter-n bloom-filter-m bloom-filter-k bloom-filter-p-false-positive bloom-filter-set! bloom-filter-exists?) (import scheme chicken) (use srfi-1 srfi-13 numeric-macros moremacros iset message-digest record-variants type-checks hash-utils) ;;; (include "mathh-constants") (define-constant unsigned-integer32-size 4) ;;; Record Type (define-record-type-variant bloom-filter (unsafe unchecked inline) (%make-bloom-filter n m k bits hashes mdps) %bloom-filter? (n %bloom-filter-n %bloom-filter-n-set!) (m %bloom-filter-m) (k %bloom-filter-k) (bits %bloom-filter-bits %bloom-filter-bits-set!) (hashes %bloom-filter-hashes) (mdps %bloom-filter-algorithms) ) ;;; Support (define-syntax ensure-positive-fixnum (syntax-rules () ((_ ?loc ?n) (set! ?n (inexact->exact (check-positive-fixnum ?loc ?n))) ) ) ) (define-inline (message-digest-primitive-lengths mdps) (map message-digest-primitive-digest-length mdps) ) (define-inline (bloom-filter-indices bf obj) (map (cut <> obj) (%bloom-filter-hashes bf)) ) (define-inline (bloom-filter-k-indices bf obj) (take! (apply append! (bloom-filter-indices bf obj)) (%bloom-filter-k bf)) ) (define-inline (message-digest->integers str m) (let* ((bytsiz (##sys#size str)) (u32len (fx/ bytsiz unsigned-integer32-size)) ) (let loop ((idx 0) (ls '())) (if (fx= idx u32len) ls ; Hash functions return integers, ; and m is a fixnum, so 'inexact->exact' will produce a fixnum. (loop (fx++ idx) (cons (inexact->exact (modulo (unsigned-integer32-ref str idx) m)) ls)) ) ) ) ) (define-inline (bloom-filter-fold bf proc init obj) (fold proc init (bloom-filter-k-indices bf obj)) ) ;;; Calculators ;; Actual optimal: (expt (* n (log2 (/ m (- m 1)))) -1) ;; Returns the upper-bound, but w/ rounding (define (optimum-k n m) (inexact->exact (round (* LN2 (/ m n)))) ) (define (optimum-m k n) (inexact->exact (round (/ (* n k) LN2))) ) ; Similar to above (define (p-random-one-bit k n m) (- 1 (expt (- 1 (/ 1 m)) (* k n))) ) (define (p-false-positive k n m) (expt (p-random-one-bit k n m) k) ) (define (desired-m p n . opt-k) (ensure-positive-fixnum 'desired-m n) (let loop ((m n)) (let* ((k (optional opt-k (optimum-k n m))) (calc-p (p-false-positive k n m))) (if (<= calc-p p) (values m k calc-p) (loop (fx+ m n))))) ) (define (actual-k mdps) (fold (lambda (len tot) (fx+ tot (fx/ len unsigned-integer32-size))) 0 (message-digest-primitive-lengths mdps)) ) ;;; Bloom Filter (define (bloom-filter? obj) (%bloom-filter? obj) ) (define-check+error-type bloom-filter) (define (bloom-filter-algorithms bf) (check-bloom-filter 'bloom-filter-algorithms bf) (%bloom-filter-algorithms bf) ) (define (bloom-filter-n bf) (check-bloom-filter 'bloom-filter-n bf) (%bloom-filter-n bf) ) (define (bloom-filter-m bf) (check-bloom-filter 'bloom-filter-m bf) (%bloom-filter-m bf) ) (define (bloom-filter-k bf) (check-bloom-filter 'bloom-filter-k bf) (%bloom-filter-k bf) ) (define (make-bloom-filter m mdps #!optional des-k) (ensure-positive-fixnum 'make-bloom-filter m) (for-each (cut check-message-digest-primitive 'make-bloom-filter <>) mdps) (let ((make-mdp (lambda (mdp) ; Returns a list of hash values for the supplied object (lambda (obj) (message-digest->integers (message-digest-object mdp obj 'string) m) ) ) ) ) ; Get the "desired" # of hash values (k) (let ((act-k (actual-k mdps))) (if (not des-k) (set! des-k act-k) (begin (ensure-positive-fixnum 'make-bloom-filter des-k) (when (fx< act-k des-k) (error 'make-bloom-filter "insufficient hash functions" act-k des-k) ) ) ) ) (%make-bloom-filter 0 m des-k (make-bit-vector m) (map make-mdp mdps) mdps)) ) (define (bloom-filter-p-false-positive bf . n) (check-bloom-filter 'bloom-filter-p-false-positive bf) (bloom-filter-p-false-positive (%bloom-filter-k bf) (optional n (%bloom-filter-n bf)) (%bloom-filter-m bf)) ) (define (bloom-filter-set! bf obj) (check-bloom-filter 'bloom-filter-set! bf) (%bloom-filter-bits-set! bf (bloom-filter-fold bf (lambda (idx bits) (bit-vector-set! bits idx #t)) (%bloom-filter-bits bf) obj)) (%bloom-filter-n-set! bf (fx++ (%bloom-filter-n bf))) (void) ) (define (bloom-filter-exists? bf obj) (check-bloom-filter 'bloom-filter-exists? bf) (let ((bits (%bloom-filter-bits bf))) (fx= (%bloom-filter-k bf) (bloom-filter-fold bf (lambda (idx cnt) (if (bit-vector-ref bits idx) (fx++ cnt) cnt)) 0 obj)) ) ) ) ;module bloom-filter