;;;; bloom-filter.scm -*- Scheme -*- ;;;; Kon Lovett, Mar '20 ;;;; Kon Lovett, Jul '18 ;;;; Kon Lovett, Jun '06 ;; Issues ;; ;; - Uses 'message-digest-object' to create a digest. Some overhead if actual ;; object type is known. ;; ;; - Cannot change digest generator after filter creation. ;; ;; - Note the use of "in-place" list mutators, '(*! ...). Be vewy careful. ;; ;; - Although K may be lower all of the hashes are generated anyway. It ;; is up to the caller to optimize. ;; ;; - Use of 'iset' is slower than bit-vector. ;; ;; - Type `float' restrictions can be replaced w/ `real' but this allows some ;; compiler optimization w/o use of `fp' routines. ;; ;; - `inexact' is treated as `float' synonym; sloppy. (module bloom-filter (;export ; optimum-size optimum-k optimum-m p-random-one-bit p-false-positive desired-m actual-k ; bloom-filter make-bloom-filter bloom-filter? check-bloom-filter error-bloom-filter bloom-filter-algorithms bloom-filter-n bloom-filter-m bloom-filter-k bloom-filter-p-false-positive ; bloom-filter-set! bloom-filter-exists?) (import scheme (chicken base) (only (chicken fixnum) most-positive-fixnum fx< fx= fx+ fx-) (chicken type) (chicken memory) (chicken foreign) (only (srfi 1) list-copy take! reverse!) iset message-digest-primitive message-digest-type message-digest-item (only (check-errors sys) check-list check-inexact check-fixnum-in-range) (only type-checks-basic define-check+error-type) (only type-errors-basic signal-bounds-error make-error-interval-message)) (define (check-positive-fixnum loc obj) (import (only (chicken fixnum) most-positive-fixnum)) (check-fixnum-in-range loc obj 1 most-positive-fixnum) ) (define (error-interval loc num lft min max rgt #!optional argnam) (signal-bounds-error loc (make-error-interval-message lft min max rgt argnam) num) ) (define (error-open-interval loc num min max #!optional argnam) (error-interval loc num #\( min max #\) argnam) ) (define (check-open-interval loc num min max . args) (unless (and (< min num) (< num max)) (error-open-interval loc num min max (optional args))) num ) ;FIXME should be able to get type as module export (define-type iset:integer-set *) (define-type boolean-set iset:integer-set) (include "message-digest-primitive.types") (include "message-digest-type.types") (define-type message-digest-primitives (list-of message-digest-primitive)) (define-type bloom-filter-hasher (* (list-of fixnum) --> (list-of fixnum))) (define-type bloom-filter-hashers (list-of bloom-filter-hasher)) (define-type bloom-filter (struct bloom-filter)) (define-type bloom-filter-bit-counter (fixnum fixnum --> fixnum)) (: *make-bloom-filter (fixnum fixnum fixnum boolean-set bloom-filter-hashers message-digest-primitives --> bloom-filter)) (: *bloom-filter-n (bloom-filter --> fixnum)) (: *bloom-filter-n-set! (bloom-filter fixnum -> void)) (: *bloom-filter-m (bloom-filter --> fixnum)) (: *bloom-filter-k (bloom-filter --> fixnum)) (: *bloom-filter-bits (bloom-filter --> boolean-set)) (: *bloom-filter-bits-set! (bloom-filter boolean-set -> void)) (: *bloom-filter-hashers (bloom-filter --> bloom-filter-hashers)) (: *bloom-filter-algorithms (bloom-filter --> message-digest-primitives)) (: message-digest-primitive-lengths (message-digest-primitives --> (list-of fixnum))) (: bloom-filter-k-indices (bloom-filter * #!optional fixnum --> list)) (: blob->hashed-indices (blob fixnum fixnum fixnum (list-of fixnum) --> (list-of fixnum))) (: make-bloom-filter-hasher (message-digest-primitive fixnum --> bloom-filter-hasher)) (: bit-on! (boolean-set fixnum -> boolean-set)) (: make-bit-counter (boolean-set --> bloom-filter-bit-counter)) (: *bloom-filter-exists? (bloom-filter (list-of fixnum) --> boolean)) (: optimum-size (float fixnum --> fixnum fixnum)) (: optimum-k (fixnum fixnum --> fixnum)) (: optimum-m (fixnum fixnum --> fixnum)) (: p-random-one-bit (fixnum fixnum fixnum --> float)) (: p-false-positive (fixnum fixnum fixnum --> float)) (: desired-m (float fixnum #!optional fixnum --> fixnum fixnum float)) (: actual-k (message-digest-primitives --> fixnum)) (: make-bloom-filter ((or fixnum float) (or fixnum message-digest-primitives) #!optional (or fixnum message-digest-primitives) --> bloom-filter)) (: bloom-filter? (* -> boolean : bloom-filter)) (: bloom-filter-algorithms (bloom-filter --> message-digest-primitives)) (: bloom-filter-n (bloom-filter --> fixnum)) (: bloom-filter-m (bloom-filter --> fixnum)) (: bloom-filter-k (bloom-filter --> fixnum)) (: bloom-filter-p-false-positive (bloom-filter --> float)) (: bloom-filter-exists? (bloom-filter * --> boolean)) (: bloom-filter-set! (bloom-filter * -> boolean)) ;;; ;mathh (define-constant LN2 0.69314718055994528622676398299518041312694549560546875) ;ln(2) (define-constant -LN2^2 -0.48045301391820138814381380143458954989910125732421875) ;-(ln(2)^2) (cond-expand (64bit (define-constant WORD-SIZE (/ 64 8)) ) (else (define-constant WORD-SIZE (/ 32 8)) ) ) (: blob-byte-ref (blob fixnum --> fixnum)) (: blob-word-ref (blob fixnum --> integer)) (: blob-word-indice-ref (blob fixnum fixnum --> integer)) ;; ;NOTE datum allocation on "word" boundary (at least) ; ;NOTE `byts' type is "weaker" here than the Scheme type, `blob' only, mainly due to ;ease of implementation. ; (cond-expand (64bit (define blob-byte-ref (foreign-lambda* unsigned-byte (((nonnull-scheme-pointer unsigned-byte) byts) (unsigned-integer64 off)) " return( byts[off] );")) #; ;C > Scheme (define blob-word-ref (foreign-lambda* unsigned-integer64 (((nonnull-scheme-pointer unsigned-byte) byts) (unsigned-integer64 off)) " return( *(uint64_t *)(&byts[off]) );")) ;#; ;C > Scheme (define blob-word-indice-ref (foreign-lambda* unsigned-integer64 (((nonnull-scheme-pointer unsigned-byte) byts) (unsigned-integer64 off) (unsigned-integer64 mod)) " return( (*(uint64_t *)(&byts[off])) % mod );")) ) (else (define blob-byte-ref (foreign-lambda* unsigned-byte (((nonnull-scheme-pointer unsigned-byte) byts) (unsigned-integer32 off)) " return( byts[off] );")) #; ;C > Scheme (define blob-word-ref (foreign-lambda* unsigned-integer32 (((nonnull-scheme-pointer unsigned-byte) byts) (unsigned-integer32 off)) " return( *(uint32_t *)(&byts[off]) );")) ) ;#; ;C > Scheme (define blob-word-indice-ref (foreign-lambda* unsigned-integer32 (((nonnull-scheme-pointer unsigned-byte) byts) (unsigned-integer32 off) (unsigned-integer32 mod)) " return( (*(uint32_t *)(&byts[off])) % mod );")) ) #; ;C > Scheme this is smaller than the pure C version (define-inline (blob-word-indice-ref blb off mod) (modulo (blob-word-ref blb off) mod)) ;;; Record Type (define-record-type bloom-filter (*make-bloom-filter n m k bits hashes mdps) bloom-filter? (n *bloom-filter-n *bloom-filter-n-set!) (m *bloom-filter-m) (k *bloom-filter-k) (bits *bloom-filter-bits *bloom-filter-bits-set!) (hashes *bloom-filter-hashers) (mdps *bloom-filter-algorithms) ) ;; Support ;; (define (bit-on! bits idx) (bit-vector-set! bits idx #t) bits ) (define (message-digest-primitive-lengths mdps) (map message-digest-primitive-digest-length mdps) ) ;FIXME `k' generality is unused ;FIXME use length counter (define (bloom-filter-k-indices bf obj #!optional (k (*bloom-filter-k bf))) (let loop ((hashers (*bloom-filter-hashers bf)) (ls '())) (if (or (null? hashers) (<= k (length ls))) (take! ls k) (loop (cdr hashers) ((car hashers) obj ls)) ) ) ) (define (make-bit-counter bits) (lambda (cnt idx) (if (bit-vector-ref bits idx) (add1 cnt) cnt)) ) (define (*bloom-filter-exists? bf idxs) (let* ((bits (*bloom-filter-bits bf)) (bitcnt (foldl (make-bit-counter bits) 0 idxs)) ) (<= (*bloom-filter-k bf) bitcnt) ) ) ;FIXME need better message-digest => list-of index ;need K hashes from M message-digests (define (blob->hashed-indices blb mod wrdcnt bytrem ls) (let sum-words ((cnt wrdcnt) (off 0) (ls ls)) (cond ((fx< 0 cnt) (sum-words (fx- cnt 1) (fx+ off WORD-SIZE) (cons (blob-word-indice-ref blb off mod) ls)) ) ((fx= 0 bytrem) ls ) (else (let sum-bytes ((cnt bytrem) (off off) (num 0)) (if (fx< 0 cnt) (sum-bytes (fx- cnt 1) (fx+ off 1) (fx+ num (blob-byte-ref blb off))) (cons (modulo num mod) ls) ) ) ) ) ) ) (define (make-bloom-filter-hasher mdp m) (let-values (((wrdcnt bytrem) (quotient&modulo (message-digest-primitive-digest-length mdp) WORD-SIZE)) ) ;returns a list of hash values for the supplied object (lambda (obj ls) (let ((blb (message-digest-object mdp obj 'blob))) (blob->hashed-indices blb m wrdcnt bytrem ls) ) ) ) ) ;; Calculators ;; Actual optimal: (expt (* n (log2 (/ m (- m 1)))) -1) ;; Returns the upper-bound ;n : capacity, p : probability of false-positive ;=> m : bits, k : hashes ; (define (optimum-size p n) (let* ((m (ceiling (/ (* n (log p)) -LN2^2))) (k (ceiling (/ (* m LN2) n))) ) (values (inexact->exact m) (inexact->exact k)) ) ) (define (optimum-k n m) (inexact->exact (ceiling (* LN2 (/ m n))))) (define (optimum-m k n) (inexact->exact (ceiling (/ (* n k) LN2)))) ;exact arithmetic expt is very slow (define (p-random-one-bit k n m) (- 1.0 (expt (- 1.0 (/ 1.0 m)) (* k n)))) (define (p-false-positive k n m) (expt (p-random-one-bit k n m) k)) (define (desired-m p n #!optional opt-k) (check-inexact 'desired-m p) (let ((opt-k (and opt-k (check-positive-fixnum 'desired-m opt-k)))) (let loop ((m (check-positive-fixnum 'desired-m n))) (let* ((k (or opt-k (optimum-k n m))) (calc-p (p-false-positive k n m)) ) (cond ((<= calc-p p) (values m k calc-p) ) ((< (- most-positive-fixnum m) n) (signal-bounds-error 'desired-m "cannot represent `m' as a fixnum" m n calc-p) ) (else ;FIXME the increment is too large for large n ? (loop (+ m n)) ) ) ) ) ) ) ;actual-hash-count is extended by treating the J digest words individually (define (actual-k mdps) (foldl (lambda (tot len) (let-values (((wrdcnt bytrem) (quotient&modulo len WORD-SIZE))) (+ tot wrdcnt (if (zero? bytrem) 0 1)) ) ) 0 (message-digest-primitive-lengths mdps)) ) ;; Bloom Filter (define-check+error-type bloom-filter bloom-filter?) (define (bloom-filter-algorithms bf) (list-copy (*bloom-filter-algorithms (check-bloom-filter 'bloom-filter-algorithms bf))) ) (define (bloom-filter-n bf) (*bloom-filter-n (check-bloom-filter 'bloom-filter-n bf)) ) (define (bloom-filter-m bf) (*bloom-filter-m (check-bloom-filter 'bloom-filter-m bf)) ) (define (bloom-filter-k bf) (*bloom-filter-k (check-bloom-filter 'bloom-filter-k bf)) ) ;FIXME make-bloom-filter type is ugh ;( p n mdps) | ( m mdps [k]) ; (define (make-bloom-filter m mdps #!optional des-k) ;processing ( m mdps [k] ) or ( p n mdps ) ? (let ((m m) (mdps mdps) (des-k des-k) ) (if (list? mdps) (check-positive-fixnum 'make-bloom-filter m) (let ((p (check-inexact 'make-bloom-filter m)) (n (check-positive-fixnum 'make-bloom-filter mdps)) ) (check-open-interval 'make-bloom-filter p 0.0 1.0 'p) (set! mdps des-k) (set!-values (m des-k) (optimum-size p n)) ) ) ;algorithms (for-each (cut check-message-digest-primitive 'make-bloom-filter <>) (check-list 'make-bloom-filter mdps)) ;get the "desired" # of hash values (k) (let ((act-k (actual-k mdps))) (if (not des-k) (set! des-k act-k) (when (< act-k (check-positive-fixnum 'make-bloom-filter des-k)) ;FIXME tell them how ! (error 'make-bloom-filter "insufficient hash functions supplied" act-k des-k) ) ) ) ;bloom filter is a multi-hash into a bitvector (let ((bits (make-bit-vector m)) (hashers (map (cut make-bloom-filter-hasher <> m) mdps)) ) (*make-bloom-filter 0 m des-k bits hashers mdps) ) ) ) (define (bloom-filter-p-false-positive bf . n) (check-bloom-filter 'bloom-filter-p-false-positive bf) (p-false-positive (*bloom-filter-k bf) (optional n (*bloom-filter-n bf)) (*bloom-filter-m bf)) ) (define (bloom-filter-exists? bf obj) (let ((idxs (bloom-filter-k-indices (check-bloom-filter 'bloom-filter-exists? bf) obj))) (*bloom-filter-exists? bf idxs) ) ) (define (bloom-filter-set! bf obj) ;tracks actual pop (n) so cannot "reset" (let ((idxs (bloom-filter-k-indices (check-bloom-filter 'bloom-filter-set! bf) obj))) (cond ((*bloom-filter-exists? bf idxs) ;collision #f) (else ;spray rep bits (*bloom-filter-bits-set! bf (foldl bit-on! (*bloom-filter-bits bf) idxs)) ;bump actual pop (*bloom-filter-n-set! bf (add1 (*bloom-filter-n bf))) ;no collision #t) ) ) ) ) ;module bloom-filter