;;;; bloom-filter.scm -*- Scheme -*- ;;;; Kon Lovett, Jul '18 ;;;; Kon Lovett, Jun '06 ;; Issues ;; ;; - Uses 'message-digest-object' to create a digest. Some overhead if actual ;; object type is known. ;; ;; - Cannot change digest generator after filter creation. ;; ;; - Note the use of "in-place" list mutators, '(*! ...). Be vewy careful. ;; ;; - Although K may be lower all of the hashes are generated anyway. It ;; is up to the caller to optimize. ;; ;; - Use of 'iset' is slower than bit-vector. (module bloom-filter (;export ; optimum-size optimum-k optimum-m p-random-one-bit p-false-positive desired-m actual-k ; make-bloom-filter bloom-filter? check-bloom-filter error-bloom-filter bloom-filter-algorithms bloom-filter-n bloom-filter-m bloom-filter-k bloom-filter-p-false-positive bloom-filter-set! bloom-filter-exists?) (import scheme (chicken base) (chicken fixnum) (chicken flonum) (chicken type) (chicken memory) (only (srfi 1) list-copy take! reverse!) iset message-digest-primitive message-digest-type message-digest-item record-variants (only type-checks define-check+error-type check-positive-fixnum check-flonum check-open-interval check-list) (only type-errors-basic signal-bounds-error)) (declare (bound-to-procedure ##sys#signal-hook)) ;;; (define-type message-digest-primitive (struct message-digest-primitive)) (define-type message-digest-primitives (list-of message-digest-primitive)) (define-type bloom-filter-hasher (* (list-of fixnum) -> (list-of fixnum))) ;; (define-constant LN2 0.69314718055994528622676398299518041312694549560546875) ;ln(2) (define-constant -LN2^2 -0.48045301391820138814381380143458954989910125732421875) ;-(ln(2)^2) ;; (define (machine-word-size) (cond-expand (64bit 8 ) (else 4 ) ) ) (define (object-data-pointer obj) ;skip over the machine-word header (pointer+ (object->pointer obj) (machine-word-size)) ) (cond-expand (64bit (define (unsigned-integer64-ref obj idx) (pointer-u64-ref (pointer+ (object-data-pointer obj) idx)) ) ) (else (define (unsigned-integer32-ref obj idx) (pointer-u32-ref (pointer+ (object-data-pointer obj) idx)) ) ) ) (: unsigned-native-integer-ref (* fixnum --> (or fixnum bignum))) ; (define (unsigned-native-integer-ref obj idx) (cond-expand (64bit (unsigned-integer64-ref obj idx) ) (else (unsigned-integer32-ref obj idx) ) ) ) ;;; Record Type (define-type bloom-filter (struct bloom-filter)) (define-record-type-variant bloom-filter (unsafe unchecked inline) (%make-bloom-filter n m k bits hashes mdps) %bloom-filter? (n %bloom-filter-n %bloom-filter-n-set!) (m %bloom-filter-m) (k %bloom-filter-k) (bits %bloom-filter-bits %bloom-filter-bits-set!) (hashes %bloom-filter-hashers) (mdps %bloom-filter-algorithms) ) ;;; Support ;; (: message-digest-primitive-lengths (message-digest-primitives --> (list-of fixnum))) ; (define (message-digest-primitive-lengths mdps) (map message-digest-primitive-digest-length mdps) ) (: bloom-filter-indices (bloom-filter * --> list)) ; (define (bloom-filter-indices bf obj) (foldl (lambda (ls hasher) (hasher obj ls)) '() (%bloom-filter-hashers bf)) ) (: bloom-filter-k-indices (bloom-filter * --> list)) ; (define (bloom-filter-k-indices bf obj) (take! (bloom-filter-indices bf obj) (%bloom-filter-k bf)) ) (: bloom-filter-foldl (bloom-filter procedure * list --> *)) ; (define (bloom-filter-foldl bf func init obj) (foldl func init (bloom-filter-k-indices bf obj)) ) (: message-digest-result->integers (* fixnum fixnum fixnum (list-of fixnum) -> (list-of fixnum))) ; (define (message-digest-result->integers obj m wrdcnt bytrem ls) ; (define (words) (let loop ((idx 0) (ls ls)) (if (fx= idx wrdcnt) ls (let* ( (num (unsigned-native-integer-ref obj idx)) (int (inexact->exact (fpfloor (exact->inexact (remainder num m))))) ) (loop (fx+ idx 1) (cons int ls)) ) ) ) ) ; (let* ( (ptr (object->pointer obj)) (bytoff (fx* wrdcnt (machine-word-size))) (ptr (pointer+ ptr bytoff)) ) (do ( (cnt bytrem (fx- cnt 1)) (ptr ptr (pointer+ ptr 1)) (int 0 (fx+ int (pointer-u8-ref ptr))) ) ((fx= 0 cnt) (reverse! (cons int (words))) ) ) ) ) (: make-bloom-filter-hasher (message-digest-primitive fixnum -> bloom-filter-hasher)) ; (define (make-bloom-filter-hasher mdp m) (let ( (len (message-digest-primitive-digest-length mdp)) (siz (machine-word-size)) ) (let ( (wrdcnt (fx/ len siz) ) (bytrem (fxmod len siz) ) ) ;returns a list of hash values for the supplied object (lambda (obj ls) (let ( (blb (message-digest-object mdp obj 'blob)) ) (message-digest-result->integers blb m wrdcnt bytrem ls) ) ) ) ) ) ;;; Calculators ;; Actual optimal: (expt (* n (log2 (/ m (- m 1)))) -1) ;; Returns the upper-bound (: optimum-size (float fixnum --> fixnum fixnum)) ; ;n : capacity, p : probability of false-positive ;=> m : bits, k : hashes (define (optimum-size p n) (let* ( (nx (exact->inexact n)) (mx (fpceiling (fp/ (fp* nx (fplog p)) -LN2^2))) (kx (fpceiling (fp/ (fp* mx LN2) nx))) ) (values (inexact->exact mx) (inexact->exact kx)) ) ) (: optimum-k (fixnum fixnum --> fixnum)) ; (define (optimum-k n m) (let ( (nx (exact->inexact n)) (mx (exact->inexact m)) ) (inexact->exact (fpceiling (fp* LN2 (fp/ mx nx)))) ) ) (: optimum-m (fixnum fixnum --> fixnum)) ; (define (optimum-m k n) (let ( (kx (exact->inexact k)) (nx (exact->inexact n)) ) (inexact->exact (fpceiling (fp/ (fp* nx kx) LN2))) ) ) (: p-random-one-bit (fixnum fixnum fixnum --> float)) ; (define (p-random-one-bit k n m) (let ( (kx (exact->inexact k)) (nx (exact->inexact n)) (mx (exact->inexact m)) ) (fp- 1.0 (fpexpt (fp- 1.0 (fp/ 1.0 mx)) (fp* kx nx))) ) ) (: p-false-positive (fixnum fixnum fixnum --> float)) ; (define (p-false-positive k n m) (let ( (kx (exact->inexact k)) ) (fpexpt (p-random-one-bit k n m) kx) ) ) (: desired-m (float fixnum #!optional fixnum --> fixnum fixnum float)) ; (define (desired-m p n #!optional opt-k) (check-flonum 'desired-m p 'p) (let ( (opt-k (and opt-k (check-positive-fixnum 'desired-m opt-k 'optimal-k))) ) (let loop ((m (check-positive-fixnum 'desired-m n 'n))) (let* ( (k (or opt-k (optimum-k n m))) (calc-p (p-false-positive k n m)) ) (cond ((fp<= calc-p p) (values m k calc-p) ) ((fx< (fx- most-positive-fixnum m) n) (signal-bounds-error 'desired-m "cannot represent `m' as a fixnum" m n calc-p) ) (else ;FIXME the increment is too large for large n ? (loop (fx+ m n)) ) ) ) ) ) ) (: actual-k (message-digest-primitives --> fixnum)) ; (define (actual-k mdps) (let ( (siz (machine-word-size)) ) (foldl (lambda (tot len) (fx+ tot (fx/ len siz))) 0 (message-digest-primitive-lengths mdps)) ) ) ;;; Bloom Filter (: bloom-filter? (* -> boolean : bloom-filter)) ; (define (bloom-filter? obj) (%bloom-filter? obj) ) (define-check+error-type bloom-filter %bloom-filter?) (: bloom-filter-algorithms (bloom-filter --> message-digest-primitives)) ; (define (bloom-filter-algorithms bf) (list-copy (%bloom-filter-algorithms (check-bloom-filter 'bloom-filter-algorithms bf))) ) (: bloom-filter-n (bloom-filter --> fixnum)) ; (define (bloom-filter-n bf) (%bloom-filter-n (check-bloom-filter 'bloom-filter-n bf)) ) (: bloom-filter-m (bloom-filter --> fixnum)) ; (define (bloom-filter-m bf) (%bloom-filter-m (check-bloom-filter 'bloom-filter-m bf)) ) (: bloom-filter-k (bloom-filter --> fixnum)) ; (define (bloom-filter-k bf) (%bloom-filter-k (check-bloom-filter 'bloom-filter-k bf)) ) ;FIXME make-bloom-filter type is ugh ;( p n mdps) | ( m mdps [k]) (: make-bloom-filter ((or fixnum float) (or fixnum message-digest-primitives) #!optional (or fixnum message-digest-primitives) -> bloom-filter)) ; (define (make-bloom-filter m mdps #!optional des-k) ;processing ( m mdps [k] ) or ( p n mdps ) ? (if (list? mdps) (check-positive-fixnum 'make-bloom-filter m 'm) (let ( (p (check-flonum 'make-bloom-filter m 'p)) (n (check-positive-fixnum 'make-bloom-filter mdps 'n)) ) (check-open-interval 'make-bloom-filter p 0.0 1.0 'p) (set! mdps des-k) (set!-values (m des-k) (optimum-size p n)) ) ) ;algorithms (for-each (cut check-message-digest-primitive 'make-bloom-filter <>) (check-list 'make-bloom-filter mdps 'mdps)) ;get the "desired" # of hash values (k) (let ( (act-k (actual-k mdps)) ) (if (not des-k) (set! des-k act-k) (when (fx< act-k (check-positive-fixnum 'make-bloom-filter des-k)) ;FIXME tell them how ! (error 'make-bloom-filter "insufficient hash functions supplied" act-k des-k) ) ) ) ;bloom filter is a multi-hash into a bitvector (let ( (bits (make-bit-vector m)) (hashers (map (cut make-bloom-filter-hasher <> m) mdps)) ) (%make-bloom-filter 0 m des-k bits hashers mdps) ) ) (: bloom-filter-p-false-positive (bloom-filter --> float)) ; (define (bloom-filter-p-false-positive bf . n) (check-bloom-filter 'bloom-filter-p-false-positive bf) (p-false-positive (%bloom-filter-k bf) (optional n (%bloom-filter-n bf)) (%bloom-filter-m bf)) ) (: bloom-filter-set! (bloom-filter * -> void)) ; (define (bloom-filter-set! bf obj) (unless (bloom-filter-exists? (check-bloom-filter 'bloom-filter-set! bf) obj) (let ( (bits (bloom-filter-foldl bf (lambda (bits idx) (bit-vector-set! bits idx #t) bits ) (%bloom-filter-bits bf) obj)) ) (%bloom-filter-bits-set! bf bits) ) (%bloom-filter-n-set! bf (fx+ (%bloom-filter-n bf) 1)) ) ) (: bloom-filter-exists? (bloom-filter * --> boolean)) ; (define (bloom-filter-exists? bf obj) (let* ( (bits (%bloom-filter-bits (check-bloom-filter 'bloom-filter-exists? bf))) (refs (bloom-filter-foldl bf (lambda (cnt idx) (if (bit-vector-ref bits idx) (fx+ cnt 1) cnt ) ) 0 obj)) ) (fx= (%bloom-filter-k bf) refs) ) ) ) ;module bloom-filter