;;;; bloom-filter.scm -*- Scheme -*- ;;;; Kon Lovett, Mar '20 ;;;; Kon Lovett, Jul '18 ;;;; Kon Lovett, Jun '06 ;; Issues ;; ;; - Uses 'message-digest-object' to create a digest. Some overhead if actual ;; object type is known. ;; ;; - Cannot change digest generator after filter creation. ;; ;; - Note the use of "in-place" list mutators, '(*! ...). Be vewy careful. ;; ;; - Although K may be lower all of the hashes are generated anyway. It ;; is up to the caller to optimize. ;; ;; - Use of 'iset' is slower than bit-vector. (module bloom-filter (;export ; optimum-size optimum-k optimum-m p-random-one-bit p-false-positive desired-m actual-k ; make-bloom-filter bloom-filter? check-bloom-filter error-bloom-filter bloom-filter-algorithms bloom-filter-n bloom-filter-m bloom-filter-k bloom-filter-p-false-positive bloom-filter-set! bloom-filter-exists?) (import scheme) (import (chicken base)) (import (only (chicken fixnum) most-positive-fixnum)) (import (chicken type)) (import (chicken memory)) (import (only (srfi 1) list-copy take! reverse!)) (import iset) (import message-digest-primitive) (import message-digest-type) (import message-digest-item) (import (only type-checks define-check+error-type check-positive-fixnum check-flonum check-open-interval check-list)) (import (only type-errors-basic signal-bounds-error)) ;;; ;FIXME should be able to get type as module export (define-type iset:integer-set *) (define-type boolean-set iset:integer-set) (define-type message-digest-primitive (struct message-digest-primitive)) (define-type message-digest-primitives (list-of message-digest-primitive)) (define-type bloom-filter-hasher (* (list-of fixnum) -> (list-of fixnum))) (define-type bloom-filter-hashers (list-of bloom-filter-hasher)) (define-type unsigned-native-integer-getter (* fixnum --> (or fixnum bignum))) ;; ;mathh (define-constant LN2 0.69314718055994528622676398299518041312694549560546875) ;ln(2) (define-constant -LN2^2 -0.48045301391820138814381380143458954989910125732421875) ;-(ln(2)^2) ;; ; (define-constant MACHINE-WORD-SIZE (cond-expand (64bit 8) (else 4))) (define-inline (object-data-pointer obj) ;skip over the machine-word header (pointer+ (object->pointer obj) MACHINE-WORD-SIZE) ) (define-inline (pointer-word-offset ptr idx) (assume ((idx fixnum)) (pointer+ ptr (* idx MACHINE-WORD-SIZE)) ) ) (define-inline (object-data-offset obj idx) (pointer-word-offset (object-data-pointer obj) idx) ) ;NOTE ((cond-expand (64bit pointer-u64-ref) (else ...) (object-data-offset ...))) Fails! (: wordvector-ref unsigned-native-integer-getter) ; (define (wordvector-ref obj idx) (cond-expand (64bit (pointer-u64-ref (object-data-offset obj idx))) (else (pointer-u32-ref (object-data-offset obj idx)))) ) ;;; Record Type (define-type bloom-filter (struct bloom-filter)) (: *make-bloom-filter (fixnum fixnum fixnum boolean-set bloom-filter-hashers message-digest-primitives -> bloom-filter)) (: bloom-filter? (* -> boolean : bloom-filter)) (: *bloom-filter-n (bloom-filter -> fixnum)) (: *bloom-filter-n-set! (bloom-filter fixnum -> void)) (: *bloom-filter-m (bloom-filter --> fixnum)) (: *bloom-filter-k (bloom-filter --> fixnum)) (: *bloom-filter-bits (bloom-filter -> boolean-set)) (: *bloom-filter-bits-set! (bloom-filter boolean-set -> void)) (: *bloom-filter-hashers (bloom-filter --> bloom-filter-hashers)) (: *bloom-filter-algorithms (bloom-filter --> message-digest-primitives)) ; (define-record-type bloom-filter (*make-bloom-filter n m k bits hashes mdps) bloom-filter? (n *bloom-filter-n *bloom-filter-n-set!) (m *bloom-filter-m) (k *bloom-filter-k) (bits *bloom-filter-bits *bloom-filter-bits-set!) (hashes *bloom-filter-hashers) (mdps *bloom-filter-algorithms) ) ;;; Support ;; (: message-digest-primitive-lengths (message-digest-primitives --> (list-of fixnum))) ; (define (message-digest-primitive-lengths mdps) (map message-digest-primitive-digest-length mdps) ) (: bloom-filter-indices (bloom-filter * --> list)) ; (define (bloom-filter-indices bf obj) (foldl (lambda (ls hasher) (hasher obj ls)) '() (*bloom-filter-hashers bf)) ) (: bloom-filter-k-indices (bloom-filter * --> list)) ; (define (bloom-filter-k-indices bf obj) (take! (bloom-filter-indices bf obj) (*bloom-filter-k bf)) ) (: bloom-filter-foldl (bloom-filter procedure * list --> *)) ; (define (bloom-filter-foldl bf func init obj) (foldl func init (bloom-filter-k-indices bf obj)) ) (: message-digest-result->integers (* fixnum fixnum fixnum (list-of fixnum) -> (list-of fixnum))) ; (define (message-digest-result->integers obj m wrdcnt bytrem ls) ; (define (whole-words) (assume ((wrdcnt fixnum)) (let loop ((idx (the fixnum 0)) (ints ls)) (if (>= idx wrdcnt) ints (let* ( (num (wordvector-ref obj idx)) (int (remainder num m)) ) (loop (add1 idx) (cons int ints)) ) ) ) ) ) ; (define (partial-word) (assume ((wrdcnt fixnum) (bytrem fixnum)) (let ( (ptr (object->pointer obj)) (bytoff (* wrdcnt MACHINE-WORD-SIZE)) ) (do ( (cnt bytrem (sub1 cnt)) (ptr (pointer+ ptr bytoff) (pointer+ ptr 1)) (int (the fixnum 0) (+ int (pointer-u8-ref ptr))) ) ((>= 0 cnt) int) ) ) ) ) ; (reverse! (cons (partial-word) (whole-words))) ) (: make-bloom-filter-hasher (message-digest-primitive fixnum -> bloom-filter-hasher)) ; (define (make-bloom-filter-hasher mdp m) (let ( (len (message-digest-primitive-digest-length mdp)) ) (let ( (wrdcnt (quotient len MACHINE-WORD-SIZE) ) (bytrem (modulo len MACHINE-WORD-SIZE) ) ) ;returns a list of hash values for the supplied object (lambda (obj ls) (let ( (blb (message-digest-object mdp obj 'blob)) ) (message-digest-result->integers blb m wrdcnt bytrem ls) ) ) ) ) ) ;;; Calculators ;; Actual optimal: (expt (* n (log2 (/ m (- m 1)))) -1) ;; Returns the upper-bound ;n : capacity, p : probability of false-positive ;=> m : bits, k : hashes (: optimum-size (float fixnum --> fixnum fixnum)) ; (define (optimum-size p n) (let* ( (nx (exact->inexact n)) (mx (ceiling (/ (* nx (log p)) -LN2^2))) (kx (ceiling (/ (* mx LN2) nx))) ) (values (inexact->exact mx) (inexact->exact kx)) ) ) (: optimum-k (fixnum fixnum --> fixnum)) ; (define (optimum-k n m) (let ( (nx (exact->inexact n)) (mx (exact->inexact m)) ) (inexact->exact (ceiling (* LN2 (/ mx nx)))) ) ) (: optimum-m (fixnum fixnum --> fixnum)) ; (define (optimum-m k n) (let ( (kx (exact->inexact k)) (nx (exact->inexact n)) ) (inexact->exact (ceiling (/ (* nx kx) LN2))) ) ) (: p-random-one-bit (fixnum fixnum fixnum --> float)) ; (define (p-random-one-bit k n m) (let ( (kx (exact->inexact k)) (nx (exact->inexact n)) (mx (exact->inexact m)) ) (- 1.0 (expt (- 1.0 (/ 1.0 mx)) (* kx nx))) ) ) (: p-false-positive (fixnum fixnum fixnum --> float)) ; (define (p-false-positive k n m) (let ( (kx (exact->inexact k)) ) (expt (p-random-one-bit k n m) kx) ) ) (: desired-m (float fixnum #!optional fixnum --> fixnum fixnum float)) ; (define (desired-m p n #!optional opt-k) (check-flonum 'desired-m p 'p) (let ( (opt-k (and opt-k (check-positive-fixnum 'desired-m opt-k 'optimal-k))) ) (let loop ((m (check-positive-fixnum 'desired-m n 'n))) (let* ( (k (or opt-k (optimum-k n m))) (calc-p (p-false-positive k n m)) ) (cond ((<= calc-p p) (values m k calc-p) ) ((< (- most-positive-fixnum m) n) (signal-bounds-error 'desired-m "cannot represent `m' as a fixnum" m n calc-p) ) (else ;FIXME the increment is too large for large n ? (loop (+ m n)) ) ) ) ) ) ) (: actual-k (message-digest-primitives --> fixnum)) ; (define (actual-k mdps) (let ((wrdcntr (lambda (tot len) (+ tot (quotient len MACHINE-WORD-SIZE))))) (foldl wrdcntr 0 (message-digest-primitive-lengths mdps)) ) ) ;;; Bloom Filter (define-check+error-type bloom-filter bloom-filter?) (: bloom-filter-algorithms (bloom-filter --> message-digest-primitives)) ; (define (bloom-filter-algorithms bf) (list-copy (*bloom-filter-algorithms (check-bloom-filter 'bloom-filter-algorithms bf))) ) (: bloom-filter-n (bloom-filter -> fixnum)) ; (define (bloom-filter-n bf) (*bloom-filter-n (check-bloom-filter 'bloom-filter-n bf)) ) (: bloom-filter-m (bloom-filter --> fixnum)) ; (define (bloom-filter-m bf) (*bloom-filter-m (check-bloom-filter 'bloom-filter-m bf)) ) (: bloom-filter-k (bloom-filter --> fixnum)) ; (define (bloom-filter-k bf) (*bloom-filter-k (check-bloom-filter 'bloom-filter-k bf)) ) ;FIXME make-bloom-filter type is ugh ;( p n mdps) | ( m mdps [k]) (: make-bloom-filter ((or fixnum float) (or fixnum message-digest-primitives) #!optional (or fixnum message-digest-primitives) -> bloom-filter)) ; (define (make-bloom-filter m mdps #!optional des-k) ;processing ( m mdps [k] ) or ( p n mdps ) ? (if (list? mdps) (check-positive-fixnum 'make-bloom-filter m 'm) (let ( (p (check-flonum 'make-bloom-filter m 'p)) (n (check-positive-fixnum 'make-bloom-filter mdps 'n)) ) (check-open-interval 'make-bloom-filter p 0.0 1.0 'p) (set! mdps des-k) (set!-values (m des-k) (optimum-size p n)) ) ) ;algorithms (for-each (cut check-message-digest-primitive 'make-bloom-filter <>) (check-list 'make-bloom-filter mdps 'mdps)) ;get the "desired" # of hash values (k) (let ( (act-k (actual-k mdps)) ) (if (not des-k) (set! des-k act-k) (when (< act-k (check-positive-fixnum 'make-bloom-filter des-k)) ;FIXME tell them how ! (error 'make-bloom-filter "insufficient hash functions supplied" act-k des-k) ) ) ) ;bloom filter is a multi-hash into a bitvector (let ( (bits (make-bit-vector m)) (hashers (map (cut make-bloom-filter-hasher <> m) mdps)) ) (*make-bloom-filter 0 m des-k bits hashers mdps) ) ) (: bloom-filter-p-false-positive (bloom-filter -> float)) ; (define (bloom-filter-p-false-positive bf . n) (check-bloom-filter 'bloom-filter-p-false-positive bf) (p-false-positive (*bloom-filter-k bf) (optional n (*bloom-filter-n bf)) (*bloom-filter-m bf)) ) (: bit-on! (boolean-set fixnum -> boolean-set)) ; (define (bit-on! bits idx) (bit-vector-set! bits idx #t) bits ) (: *make-bit-counter (boolean-set -> (fixnum fixnum -> fixnum))) ; (define-inline (*make-bit-counter bits) (lambda (cnt idx) (if (bit-vector-ref bits idx) (add1 cnt) cnt)) ) (: *bloom-filter-exists? (bloom-filter * -> boolean)) ; (define-inline (*bloom-filter-exists? bf obj) (let* ( (bits (*bloom-filter-bits bf)) (bitcnt (bloom-filter-foldl bf (*make-bit-counter bits) 0 obj)) ) (<= (*bloom-filter-k bf) bitcnt) ) ) (: bloom-filter-exists? (bloom-filter * --> boolean)) ; (define (bloom-filter-exists? bf obj) (*bloom-filter-exists? (check-bloom-filter 'bloom-filter-exists? bf) obj) ) (: bloom-filter-set! (bloom-filter * -> void)) ; (define (bloom-filter-set! bf obj) ;tracks actual pop (n) so cannot "reset" (unless (*bloom-filter-exists? (check-bloom-filter 'bloom-filter-set! bf) obj) ;spray rep bits (let ( (bits (bloom-filter-foldl bf bit-on! (*bloom-filter-bits bf) obj)) ) (*bloom-filter-bits-set! bf bits) ) ;bump actual pop (*bloom-filter-n-set! bf (add1 (*bloom-filter-n bf))) ) ) ) ;module bloom-filter