;;;; bloom-filter.scm -*- Scheme -*- ;;;; Kon Lovett, Mar '20 ;;;; Kon Lovett, Jul '18 ;;;; Kon Lovett, Jun '06 ;; Issues ;; ;; - Uses 'message-digest-object' to create a digest. Some overhead if actual ;; object type is known. ;; ;; - Cannot change digest generator after filter creation. ;; ;; - Note the use of "in-place" list mutators, '(*! ...). Be vewy careful. ;; ;; - Although K may be lower all of the hashes are generated anyway. It ;; is up to the caller to optimize. ;; ;; - Use of 'iset' is slower than bit-vector. ;; ;; - Type `float' restrictions can be replaced w/ `real' but this allows some ;; compiler optimization w/o use of `fp' routines. ;; ;; - `inexact' is treated as `float' synonym; sloppy. (module bloom-filter (;export ; optimum-size optimum-k optimum-m p-random-one-bit p-false-positive desired-m actual-k ; bloom-filter make-bloom-filter bloom-filter? check-bloom-filter error-bloom-filter bloom-filter-algorithms bloom-filter-n bloom-filter-m bloom-filter-k bloom-filter-p-false-positive ; bloom-filter-set! bloom-filter-exists?) (import scheme (chicken base) (only (chicken fixnum) most-positive-fixnum fx* fx+ fx-) (chicken type) (chicken memory) (only (srfi 1) list-copy take! reverse!) iset message-digest-primitive message-digest-type message-digest-item (only (check-errors sys) check-list check-inexact check-fixnum-in-range) (only type-checks-basic define-check+error-type) (only type-errors-basic signal-bounds-error make-error-interval-message)) (define (check-positive-fixnum loc obj) (import (only (chicken fixnum) most-positive-fixnum)) (check-fixnum-in-range loc obj 1 most-positive-fixnum) ) (define (error-interval loc num lft min max rgt #!optional argnam) (signal-bounds-error loc (make-error-interval-message lft min max rgt argnam) num) ) (define (error-open-interval loc num min max #!optional argnam) (error-interval loc num #\( min max #\) argnam) ) (define (check-open-interval loc num min max . args) (unless (and (< min num) (< num max)) (error-open-interval loc num min max (optional args))) num ) ;FIXME should be able to get type as module export (define-type iset:integer-set *) (define-type boolean-set iset:integer-set) (include "message-digest-primitive.types") (include "message-digest-type.types") (define-type message-digest-primitives (list-of message-digest-primitive)) (define-type bloom-filter-hasher (* (list-of fixnum) --> (list-of fixnum))) (define-type bloom-filter-hashers (list-of bloom-filter-hasher)) (define-type bloom-filter (struct bloom-filter)) (define-type bloom-filter-bit-counter (fixnum fixnum --> fixnum)) (: wordvector-ref (pointer --> integer)) (: blob-word-ref (blob fixnum --> integer)) (: *make-bloom-filter (fixnum fixnum fixnum boolean-set bloom-filter-hashers message-digest-primitives --> bloom-filter)) (: *bloom-filter-n (bloom-filter --> fixnum)) (: *bloom-filter-n-set! (bloom-filter fixnum -> void)) (: *bloom-filter-m (bloom-filter --> fixnum)) (: *bloom-filter-k (bloom-filter --> fixnum)) (: *bloom-filter-bits (bloom-filter --> boolean-set)) (: *bloom-filter-bits-set! (bloom-filter boolean-set -> void)) (: *bloom-filter-hashers (bloom-filter --> bloom-filter-hashers)) (: *bloom-filter-algorithms (bloom-filter --> message-digest-primitives)) (: message-digest-primitive-lengths (message-digest-primitives --> (list-of fixnum))) (: bloom-filter-indices (bloom-filter * --> list)) (: bloom-filter-k-indices (bloom-filter * --> list)) (: blob->hashed-indices (blob fixnum fixnum fixnum (list-of fixnum) --> (list-of fixnum))) (: make-bloom-filter-hasher (message-digest-primitive fixnum --> bloom-filter-hasher)) (: bit-on! (boolean-set fixnum -> boolean-set)) (: make-bit-counter (boolean-set --> bloom-filter-bit-counter)) (: *bloom-filter-exists? (bloom-filter (list-of fixnum) --> boolean)) (: optimum-size (float fixnum --> fixnum fixnum)) (: optimum-k (fixnum fixnum --> fixnum)) (: optimum-m (fixnum fixnum --> fixnum)) (: p-random-one-bit (fixnum fixnum fixnum --> float)) (: p-false-positive (fixnum fixnum fixnum --> float)) (: desired-m (float fixnum #!optional fixnum --> fixnum fixnum float)) (: actual-k (message-digest-primitives --> fixnum)) (: make-bloom-filter ((or fixnum float) (or fixnum message-digest-primitives) #!optional (or fixnum message-digest-primitives) --> bloom-filter)) (: bloom-filter? (* -> boolean : bloom-filter)) (: bloom-filter-algorithms (bloom-filter --> message-digest-primitives)) (: bloom-filter-n (bloom-filter --> fixnum)) (: bloom-filter-m (bloom-filter --> fixnum)) (: bloom-filter-k (bloom-filter --> fixnum)) (: bloom-filter-p-false-positive (bloom-filter --> float)) (: bloom-filter-exists? (bloom-filter * --> boolean)) (: bloom-filter-set! (bloom-filter * -> boolean)) ;; ;mathh (define-constant LN2 0.69314718055994528622676398299518041312694549560546875) ;ln(2) (define-constant -LN2^2 -0.48045301391820138814381380143458954989910125732421875) ;-(ln(2)^2) ;; #; ;FIXME non-determinism easier to generate w/ 32bit (begin (define-constant MACHINE-WORD-SIZE (/ 32 8)) (define (wordvector-ref ptr) (pointer-u32-ref ptr)) ) ;#; ;FIXME non-determinism easier to generate w/ 32bit (cond-expand (64bit (define-constant MACHINE-WORD-SIZE (/ 64 8)) (define (wordvector-ref ptr) (pointer-u64-ref ptr)) ) (else (define-constant MACHINE-WORD-SIZE (/ 32 8)) (define (wordvector-ref ptr) (pointer-u32-ref ptr)) ) ) ;returns pointer offset n words into the object (define (blob-word-ref obj n) ;skip over the machine-word header (wordvector-ref (pointer+ (object->pointer obj) (fx* (fx+ n 1) MACHINE-WORD-SIZE))) ) ;;; Record Type (define-record-type bloom-filter (*make-bloom-filter n m k bits hashes mdps) bloom-filter? (n *bloom-filter-n *bloom-filter-n-set!) (m *bloom-filter-m) (k *bloom-filter-k) (bits *bloom-filter-bits *bloom-filter-bits-set!) (hashes *bloom-filter-hashers) (mdps *bloom-filter-algorithms) ) ;; Support ;; (define (bit-on! bits idx) (bit-vector-set! bits idx #t) bits ) (define (message-digest-primitive-lengths mdps) (map message-digest-primitive-digest-length mdps) ) (define (bloom-filter-indices bf obj) (foldl (lambda (ls hasher) (hasher obj ls)) '() (*bloom-filter-hashers bf)) ) (define (bloom-filter-k-indices bf obj) (take! (bloom-filter-indices bf obj) (*bloom-filter-k bf)) ) (define (make-bit-counter bits) (lambda (cnt idx) (if (bit-vector-ref bits idx) (add1 cnt) cnt)) ) (define (*bloom-filter-exists? bf idxs) (let* ((bits (*bloom-filter-bits bf)) (bitcnt (foldl (make-bit-counter bits) 0 idxs)) ) (<= (*bloom-filter-k bf) bitcnt) ) ) ;FIXME need better message-digest => list-of index (define (blob->hashed-indices obj m wrdcnt bytrem ls) (let sum-whole ((cnt wrdcnt) (ls ls)) (if (positive? cnt) (let ((id (modulo (blob-word-ref obj (fx- wrdcnt cnt)) m))) (sum-whole (fx- cnt 1) (cons id ls)) ) ls ) ) ) #; (define (blob->hashed-indices obj m wrdcnt bytrem ls) (let sum-whole ((cnt wrdcnt) (ls ls)) (if (positive? cnt) (let ((id (modulo (blob-word-ref obj (fx- wrdcnt cnt)) m))) (sum-whole (fx- cnt 1) (cons id ls)) ) (if (zero? bytrem) ls ;FIXME little vs big endian (let sum-partial ((ptr ptr) (cnt bytrem) (num 0)) (import (only (chicken bitwise) arithmetic-shift)) (if (zero? cnt) (cons (modulo num m) ls) (sum-partial (pointer+ ptr 1) (sub1 cnt) (+ (arithmetic-shift num 8) (pointer-u8-ref ptr))) ) ) ) ) ) ) #; ;need K hashes from M message-digests (define (blob->hashed-indices obj m wrdcnt bytrem ls) (import (only (chicken bitwise) arithmetic-shift)) (let ((bytcnt (+ (* wrdcnt MACHINE-WORD-SIZE) bytrem))) (let sum-hash ((ptr (object-data-pointer obj)) (cnt bytcnt) (num 0)) (if (zero? cnt) (cons (modulo num m) ls) (sum-hash (pointer+ ptr 1) (sub1 cnt) (+ (arithmetic-shift num 8) (pointer-u8-ref ptr))) ) ) ) ) (define (make-bloom-filter-hasher mdp m) #; ;show hash results (import format) (let-values (((wrdcnt bytrem) (quotient&modulo (message-digest-primitive-digest-length mdp) MACHINE-WORD-SIZE)) ) ;(assert (zero? bytrem)) ;digest-length always multiple of wordsize? ;returns a list of hash values for the supplied object (lambda (obj ls) (let ((blb (message-digest-object mdp obj 'blob))) #; ;show hash results (format #t "~18A ~10A ~S~%" obj (message-digest-primitive-name mdp) blb) (blob->hashed-indices blb m wrdcnt bytrem ls) ) ) ) ) ;; Calculators ;; Actual optimal: (expt (* n (log2 (/ m (- m 1)))) -1) ;; Returns the upper-bound ;n : capacity, p : probability of false-positive ;=> m : bits, k : hashes ; (define (optimum-size p n) (let* ((m (ceiling (/ (* n (log p)) -LN2^2))) (k (ceiling (/ (* m LN2) n))) ) (values (inexact->exact m) (inexact->exact k)) ) ) (define (optimum-k n m) (inexact->exact (ceiling (* LN2 (/ m n))))) (define (optimum-m k n) (inexact->exact (ceiling (/ (* n k) LN2)))) ;exact arithmetic expt is very slow (define (p-random-one-bit k n m) (- 1.0 (expt (- 1.0 (/ 1.0 m)) (* k n)))) (define (p-false-positive k n m) (expt (p-random-one-bit k n m) k)) (define (desired-m p n #!optional opt-k) (check-inexact 'desired-m p) (let ((opt-k (and opt-k (check-positive-fixnum 'desired-m opt-k)))) (let loop ((m (check-positive-fixnum 'desired-m n))) (let* ((k (or opt-k (optimum-k n m))) (calc-p (p-false-positive k n m)) ) (cond ((<= calc-p p) (values m k calc-p) ) ((< (- most-positive-fixnum m) n) (signal-bounds-error 'desired-m "cannot represent `m' as a fixnum" m n calc-p) ) (else ;FIXME the increment is too large for large n ? (loop (+ m n)) ) ) ) ) ) ) (define (actual-k mdps) #; ;incorrect even when including partial-words (ceiling (/ (apply + (message-digest-primitive-lengths mdps)) MACHINE-WORD-SIZE)) ;total only full-word results (foldl (lambda (tot len) (+ tot (quotient len MACHINE-WORD-SIZE))) 0 (message-digest-primitive-lengths mdps)) ) ;; Bloom Filter (define-check+error-type bloom-filter bloom-filter?) (define (bloom-filter-algorithms bf) (list-copy (*bloom-filter-algorithms (check-bloom-filter 'bloom-filter-algorithms bf))) ) (define (bloom-filter-n bf) (*bloom-filter-n (check-bloom-filter 'bloom-filter-n bf)) ) (define (bloom-filter-m bf) (*bloom-filter-m (check-bloom-filter 'bloom-filter-m bf)) ) (define (bloom-filter-k bf) (*bloom-filter-k (check-bloom-filter 'bloom-filter-k bf)) ) ;FIXME make-bloom-filter type is ugh ;( p n mdps) | ( m mdps [k]) ; (define (make-bloom-filter m mdps #!optional des-k) ;processing ( m mdps [k] ) or ( p n mdps ) ? (let ((m m) (mdps mdps) (des-k des-k) ) (if (list? mdps) (check-positive-fixnum 'make-bloom-filter m) (let ((p (check-inexact 'make-bloom-filter m)) (n (check-positive-fixnum 'make-bloom-filter mdps)) ) (check-open-interval 'make-bloom-filter p 0.0 1.0 'p) (set! mdps des-k) (set!-values (m des-k) (optimum-size p n)) ) ) ;algorithms (for-each (cut check-message-digest-primitive 'make-bloom-filter <>) (check-list 'make-bloom-filter mdps)) ;get the "desired" # of hash values (k) (let ((act-k (actual-k mdps))) (if (not des-k) (set! des-k act-k) (when (< act-k (check-positive-fixnum 'make-bloom-filter des-k)) ;FIXME tell them how ! (error 'make-bloom-filter "insufficient hash functions supplied" act-k des-k) ) ) ) ;bloom filter is a multi-hash into a bitvector (let ((bits (make-bit-vector m)) (hashers (map (cut make-bloom-filter-hasher <> m) mdps)) ) (*make-bloom-filter 0 m des-k bits hashers mdps) ) ) ) (define (bloom-filter-p-false-positive bf . n) (check-bloom-filter 'bloom-filter-p-false-positive bf) (p-false-positive (*bloom-filter-k bf) (optional n (*bloom-filter-n bf)) (*bloom-filter-m bf)) ) (define (bloom-filter-exists? bf obj) (let ((idxs (bloom-filter-k-indices (check-bloom-filter 'bloom-filter-exists? bf) obj))) (*bloom-filter-exists? bf idxs) ) ) (define (bloom-filter-set! bf obj) ;tracks actual pop (n) so cannot "reset" (let ((idxs (bloom-filter-k-indices (check-bloom-filter 'bloom-filter-set! bf) obj))) (cond ((*bloom-filter-exists? bf idxs) ;collision #f) (else ;spray rep bits (*bloom-filter-bits-set! bf (foldl bit-on! (*bloom-filter-bits bf) idxs)) ;bump actual pop (*bloom-filter-n-set! bf (add1 (*bloom-filter-n bf))) ;no collision #t) ) ) ) ) ;module bloom-filter