;;;; bloom-filter.scm ;;;; Kon Lovett, Jun '06 ;; Issues ;; ;; - Uses 'meessage-digest-object' to create a digest. Some overhead if actual ;; object type is known. ;; ;; - Cannot change digest generator after filter creation. ;; ;; - Not optimal for 64-bit machines. ;; ;; - Note the use of "in-place" list mutators, '(*! ...). Be vewy careful. ;; ;; - Although K may be lower all of the hashes are generated anyway. It ;; is up to the caller to optimize. ;; ;; - Use of 'iset' is slower. (module bloom-filter (;export optimum-size optimum-k optimum-m p-random-one-bit p-false-positive desired-m actual-k make-bloom-filter bloom-filter? check-bloom-filter error-bloom-filter bloom-filter-algorithms bloom-filter-n bloom-filter-m bloom-filter-k bloom-filter-p-false-positive bloom-filter-set! bloom-filter-exists?) (import scheme chicken) (use srfi-1 srfi-13 iset message-digest record-variants type-checks hash-utils) (declare (bound-to-procedure ##sys#signal-hook)) ;;; (define-constant LN2 0.69314718055994528622676398299518041312694549560546875) ;ln(2) (define-constant -LN2^2 -0.48045301391820138814381380143458954989910125732421875) ;-(ln(2)^2) (define-constant unsigned-integer32-size 4) ;;; Record Type (define-record-type-variant bloom-filter (unsafe unchecked inline) (%make-bloom-filter n m k bits hashes mdps) %bloom-filter? (n %bloom-filter-n %bloom-filter-n-set!) (m %bloom-filter-m) (k %bloom-filter-k) (bits %bloom-filter-bits %bloom-filter-bits-set!) (hashes %bloom-filter-hashers) (mdps %bloom-filter-algorithms) ) ;;; Support (define (message-digest-primitive-lengths mdps) (map message-digest-primitive-digest-length mdps) ) (define (bloom-filter-indices bf obj) (fold (lambda (hasher ls) (hasher obj ls)) '() (%bloom-filter-hashers bf)) ) (define (bloom-filter-k-indices bf obj) (take! (bloom-filter-indices bf obj) (%bloom-filter-k bf)) ) (define (bloom-filter-fold bf func init obj) (fold func init (bloom-filter-k-indices bf obj)) ) (define-inline (message-digest-result->integers bv m u32cnt ls) (let loop ((idx 0) (ls ls)) (if (fx= idx u32cnt) ls ;Hash functions return integers, and m is a fixnum, so ;`inexact->exact' will produce a fixnum. (loop (fx+ idx 1) (cons (inexact->exact (remainder (unsigned-integer32-ref bv idx) m)) ls)) ) ) ) (define (make-bloom-filter-hasher mdp m) (let ((u32cnt (fx/ (message-digest-primitive-digest-length mdp) unsigned-integer32-size))) ;Returns a list of hash values for the supplied object (lambda (obj ls) (message-digest-result->integers (message-digest-object mdp obj 'blob) m u32cnt ls) ) ) ) ;;; Calculators ;; Actual optimal: (expt (* n (log2 (/ m (- m 1)))) -1) ;; Returns the upper-bound ;n : capacity, p : probability of false-positive ;=> m : bits, k : hashes (define (optimum-size p n) (let ((m (inexact->exact (ceiling (/ (* n (log p)) -LN2^2))))) (values m (inexact->exact (ceiling (/ (* m LN2) n)))) ) ) (define (optimum-k n m) (inexact->exact (ceiling (* LN2 (/ m n)))) ) (define (optimum-m k n) (inexact->exact (ceiling (/ (* n k) LN2))) ) ;Similar to above (define (p-random-one-bit k n m) (- 1 (expt (- 1 (/ 1 m)) (* k n))) ) (define (p-false-positive k n m) (expt (p-random-one-bit k n m) k) ) (define (desired-m p n . opt-k) (check-positive-fixnum 'desired-m n) (let loop ((m n)) (let* ((k (optional opt-k (optimum-k n m))) (calc-p (p-false-positive k n m))) (cond ((<= calc-p p) (values m k calc-p) ) ((fx< (fx- most-positive-fixnum m) n) (##sys#signal-hook #:bounds-error 'desired-m "cannot represent `m' as a fixnum" m n calc-p) ) (else ;FIXME the increment is too large for large n? (loop (fx+ m n)) ) ) ) ) ) (define (actual-k mdps) (fold (lambda (len tot) (fx+ tot (fx/ len unsigned-integer32-size))) 0 (message-digest-primitive-lengths mdps)) ) ;;; Bloom Filter (define (bloom-filter? obj) (%bloom-filter? obj) ) (define-check+error-type bloom-filter %bloom-filter?) (define (bloom-filter-algorithms bf) (check-bloom-filter 'bloom-filter-algorithms bf) (%bloom-filter-algorithms bf) ) (define (bloom-filter-n bf) (check-bloom-filter 'bloom-filter-n bf) (%bloom-filter-n bf) ) (define (bloom-filter-m bf) (check-bloom-filter 'bloom-filter-m bf) (%bloom-filter-m bf) ) (define (bloom-filter-k bf) (check-bloom-filter 'bloom-filter-k bf) (%bloom-filter-k bf) ) ;( p n mdps) | ( m mdps [k]) (define (make-bloom-filter m mdps #!optional des-k) ;Process paramter list variant (if (list? mdps) (check-positive-fixnum 'make-bloom-filter m 'm) (let ((p m) (n mdps)) (check-flonum 'make-bloom-filter p 'p) (check-open-interval 'make-bloom-filter p 0.0 1.0 'p) (check-positive-fixnum 'make-bloom-filter n 'n) (set! mdps des-k) (set!-values (m des-k) (optimum-size p n)) ) ) (check-list 'make-bloom-filter mdps 'mdps) (for-each (cut check-message-digest-primitive 'make-bloom-filter <>) mdps) ;Get the "desired" # of hash values (k) (let ((act-k (actual-k mdps))) (if (not des-k) (set! des-k act-k) (begin (check-positive-fixnum 'make-bloom-filter des-k) (when (fx< act-k des-k) (error 'make-bloom-filter "insufficient hash functions" act-k des-k) ) ) ) ) ;Bloom filter is a multi-hash into a bitvector (%make-bloom-filter 0 m des-k (make-bit-vector m) (map (cut make-bloom-filter-hasher <> m) mdps) mdps) ) (define (bloom-filter-p-false-positive bf . n) (check-bloom-filter 'bloom-filter-p-false-positive bf) (bloom-filter-p-false-positive (%bloom-filter-k bf) (optional n (%bloom-filter-n bf)) (%bloom-filter-m bf)) ) (define (bloom-filter-set! bf obj) (check-bloom-filter 'bloom-filter-set! bf) (%bloom-filter-bits-set! bf (bloom-filter-fold bf (lambda (idx bits) (bit-vector-set! bits idx #t)) (%bloom-filter-bits bf) obj)) (%bloom-filter-n-set! bf (fx+ (%bloom-filter-n bf) 1)) #;(void) ) (define (bloom-filter-exists? bf obj) (check-bloom-filter 'bloom-filter-exists? bf) (let ((bits (%bloom-filter-bits bf))) (fx= (%bloom-filter-k bf) (bloom-filter-fold bf (lambda (idx cnt) (if (bit-vector-ref bits idx) (fx+ cnt 1) cnt)) 0 obj)) ) ) ) ;module bloom-filter