;; ;; ;; Perform some simple randomness tests on a sequence of numerical ;; values. ;; ;; Based on the ENT program by John Walker ;; ;; ;; Copyright 2007-2012 Ivan Raikov. ;; ;; ;; This program is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or (at ;; your option) any later version. ;; ;; This program is distributed in the hope that it will be useful, but ;; WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; ;; A full copy of the GPL license can be found at ;; .")))) ;; ;; (module random-test ( make-random-test format-random-stats ) (import scheme chicken foreign extras data-structures srfi-4) (require-extension srfi-4) ;; ;; ;; Chi-Square Test ;; ;; In general, the Chi-Square distribution for an experiment with k ;; possible outcomes, performed n times, in which Y1, Y2,... Yk are ;; the number of experiments which resulted in each possible outcome, ;; and probabilities of each outcome p1, p2,... pk, is given as: ;; ;; \Chi^2 = \sum_{1 <= i <= k}{(Y_i - m * p_i)^2 / np_i} ;; ;; \Chi^2 will grow larger as the measured results diverge from those ;; expected by pure chance. The probability Q that a Chi-Square value ;; calculated for an experiment with d degrees of freedom (where ;; d=k-1, one less the number of possible outcomes) is due to chance ;; is: ;; ;; Q(\Chi^2,d) = ;; [2^{d/2} * \Gamma(d/2)]^{-1} * ;; \int_{\Chi^2}^{\infty}(t)^{d/2-1} * exp(-t/2) * dt ;; ;; Where Gamma is the generalisation of the factorial function to real ;; and complex arguments: ;; ;; Gamma(x) = \int_{0}^{\infty} t^{x-1} * exp(-t) * dt ;; ;; ;; There is no closed form solution for Q, so it must be evaluated ;; numerically. Note that the probability calculated from the \Chi^2 ;; is an approximation which is valid only for large values of n, and ;; is therefore only meaningful when calculated from a large number of ;; independent experiments. ;; ;; In this implementation, the Chi-Square distribution is calculated ;; for the list of values given as argument to the random-test ;; procedure and expressed as an absolute number and a percentage ;; which indicates how frequently a truly random sequence would exceed ;; the value calculated. ;; ;; The percentage can be interpreted as the degree to which the ;; sequence tested is suspected of being non-random. If the percentage ;; is greater than 99% or less than 1%, the sequence is almost ;; certainly not random. If the percentage is between 99% and 95% or ;; between 1% and 5%, the sequence is suspect. Percentages between 90% ;; and 95% and 5% and 10% indicate the sequence is "almost ;; suspect". ;; ;; ;; Arithmetic Mean ;; ;; This is simply the result of summing the all the values in the ;; sequence and dividing by the sequence length. If the data are close ;; to random, the mean should be about (2^b - 1)/2 where b is the ;; number of bits used to represent a value. If the mean departs from ;; this value, the values are consistently high or low. ;; ;; ;; Monte Carlo Value for Pi ;; ;; Each pair of two values in the input sequence is used as X and ;; Y coordinates within a square with side N (the length of the input ;; sequence). If the distance of the randomly-generated point is less ;; than the radius of a circle inscribed within the square, the pair ;; of values is considered a "hit". The percentage of hits can be used ;; to calculate the value of Pi: ;; ;; ;; # points within circle 1/4 * pi * r^2 ;; ---------------------- = -------------- = 1/4 * pi ;; # points within square r^2 ;; ;; ;; Therefore, ;; # points within circle ;; pi = 4 * ---------------------- ;; # points within square ;; ;; ;; For very long sequences (this approximation converges very ;; slowly), the value will approach the correct value of Pi if the ;; sequence is close to random. ;; ;; ;; Serial Correlation Coefficient ;; ;; This statistic measures the extent to which each value in the ;; sequence depends upon the previous value. For random sequences, ;; this value (which can be positive or negative) must be close to ;; zero. ;; ;; (define pochisq (foreign-lambda double "pochisq" double double)) (define log2 (foreign-lambda double "log2" double)) ;; Number of bins for Chi-Square test (define N 1024) ;; Circle radius for the Monte Carlo calculation of Pi (define R N) ;; Number of coordinates for the Monte Carlo calculation of Pi (define M 2) (define PI 3.14159265358979323846) (define (make-random-test . rest) (let-optionals rest ((scar car) (scdr cdr) (snull? null?)) (lambda (seq) ;; Bins to count occurrences of values (define bins (make-u32vector N 0)) ;; Monte Carlo coordinates (define mcoord (make-u32vector M 0)) (let ((nmin 0.0) ;; Minimum and maximum values encountered (nmax 0.0) (mp 0) ;; Monte Carlo accumulator pointer (mcount 0) ;; Monte Carlo tries (incircle 0)) ;; Monte Carlo circle inside count (let* ;; Serial correlation terms ((sccn0 (and seq (scar seq))) (sccn-1 #f) (scct1 0.0) (scct2 0.0) (scct3 0.0)) (let loop ((total 0) (seq seq)) (if (snull? seq) (random-stats total nmin nmax bins incircle mcount sccn0 scct1 scct2 scct3 sccn-1 ) (let ((n (scar seq))) ;; Updated minimum and maximum ;; Update bin counters (let ((nbin (inexact->exact (modulo n N)))) (u32vector-set! bins nbin (fx+ 1 (u32vector-ref bins nbin))) (let ((bi (u32vector-ref bins nbin))) (if (< bi nmin) (set! nmin bi)) (if (> bi nmax) (set! nmax bi))) ;; Update inside / outside circle counts for Monte Carlo ;; calculation of Pi (u32vector-set! mcoord mp nbin) (set! mp (fx+ 1 mp)) (if (fx<= M mp) (begin (set! mcount (fx+ mcount 1)) (set! mp 0) (let ((x (u32vector-ref mcoord 0)) (y (u32vector-ref mcoord 1))) ;; determine if the random "coordinates" are ;; inside or outside the circle (let ((dist (sqrt (+ (expt x 2) (expt y 2))))) (if (<= dist R) (set! incircle (fx+ 1 incircle))))))) ;; Update calculation of serial correlation coefficient (if sccn-1 (set! scct1 (+ scct1 (* sccn-1 n)))) (set! scct2 (+ scct2 n)) (set! scct3 (+ scct3 (* n n))) (set! sccn-1 n) (loop (fx+ 1 total) (scdr seq))))))))))) (define (random-stats total nmin nmax bins incircle mcount sccn0 scct1 scct2 scct3 sccn) ;; Probabilities per bin for entropy (define probs (make-f64vector N 0)) ;; Compute serial correlation coefficient (let* ((scct1 (+ scct1 (* sccn sccn0))) (scct2 (* scct2 scct2)) (sccdenom (- (* total scct3) scct2)) (scc (if (zero? sccdenom) -10000.0 (/ (- (* total scct1) scct2) sccdenom)))) ;; Scan bins and calculate probability for each bin and ;; Chi-Square distribution (let ((cexp (/ total N))) ;; Expected count per bin (let-values (((chisq pochisq datasum) (let loop ((i 0) (chisq 0.0) (datasum 0.0)) (if (fx< i N) (let ((bi (u32vector-ref bins i))) (let ((a (- bi cexp)) (p (/ bi total))) (f64vector-set! probs i p) (loop (fx+ 1 i) (+ chisq (/ (* a a) cexp)) (+ datasum (* i bi))))) (values chisq (pochisq chisq (- total 1)) datasum))))) ;; Calculate entropy (let ((ent (let loop ((i 0) (ent 0)) (if (fx< i N) (let ((p (f64vector-ref probs i))) (loop (fx+ 1 i) (if (< 0.0 p) (+ ent (* p (log2 (/ 1.0 p)))) ent))) ent)))) ;; Calculate Monte Carlo value for Pi from percentage of hits ;; within the circle (let ((montepi (* 4 (/ incircle mcount)))) `((total . ,total) (ent . ,ent) (chisq . ,chisq) (pochisq . ,pochisq) (mean . ,(/ datasum total)) (min . ,nmin) (max . ,nmax) (montepi . ,montepi) (scc . ,scc)))))))) (define (format-random-stats out stats) (fprintf out "Entropy is ~S.\n" (alist-ref 'ent stats)) (fprintf out "Chi-Square distribution for ~S samples is ~S (~S percent random).\n" (alist-ref 'total stats) (alist-ref 'chisq stats) (* 100 (alist-ref 'pochisq stats))) (fprintf out "Arithmetic mean value of the sequence is ~S (~S = random).\n" (alist-ref 'mean stats) (/ (alist-ref 'total stats) N)) (fprintf out "Monte-Carlo value for Pi is ~S (error ~S percent).\n" (alist-ref 'montepi stats) (* 100 (/ (abs (- PI (alist-ref 'montepi stats))) PI))) (fprintf out "Serial correlation coefficient is ~S.\n" (let ((scc (alist-ref 'scc stats))) (if (>= scc -99999) scc "undefined (all values equal)")))) ; Include into generated code, but don't parse: #> #include double poz (double z); #define LOG_SQRT_PI 0.5723649429247000870717135 /* log (sqrt (pi)) */ #define I_SQRT_PI 0.5641895835477562869480795 /* 1 / sqrt (pi) */ #define BIGX 20.0 /* max value to represent exp (x) */ #define ex(x) (((x) < -BIGX) ? 0.0 : exp (x)) /* ALGORITHM Compute probability of chi square value. ;; Adapted from: ;; Hill, I. D. and Pike, M. C. Algorithm 299 ;; Collected Algorithms for the CACM 1967 p. 243 ;; Updated for rounding errors based on remark in ;; ACM TOMS June 1985, page 185 */ /* ;; x: obtained chi-square value ;; df: degrees of freedom */ double pochisq (double x, double df) { double a, y, s; double e, c, z; int even; /* true if df is an even number */ if (x <= 0.0 || df < 1) return (1.0); a = 0.5 * x; even = (2*(df/2)) == df; if (df > 1) y = ex (-a); s = (even ? y : (2.0 * poz (-sqrt (x)))); if (df > 2) { x = 0.5 * (df - 1.0); z = (even ? 1.0 : 0.5); if (a > BIGX) { e = (even ? 0.0 : LOG_SQRT_PI); c = log (a); while (z <= x) { e = log (z) + e; s += ex (c*z-a-e); z += 1.0; } return (s); } else { e = (even ? 1.0 : (I_SQRT_PI / sqrt (a))); c = 0.0; while (z <= x) { e = e * (a / z); c = c + e; z += 1.0; } return (c * y + s); } } else return (s); } /* Maximum meaningful z value */ #define Z_MAX 6.0 /* ;; FUNCTION poz: probability of normal z value ;; ;; Adapted from a polynomial approximation in: ;; Ibbetson D, Algorithm 209 ;; Collected Algorithms of the CACM 1963 p. 616 ;; Note: ;; ;; This routine has six digit accuracy, so it is only ;; useful for absolute z values < 6. For z values >= to ;; 6.0, poz() returns 0.0. ;; */ double poz (double z) { double y, x, w; if (z == 0.0) x = 0.0; else { y = 0.5 * fabs (z); if (y >= (Z_MAX * 0.5)) { x = 1.0; } else if (y < 1.0) { w = y * y; x = ((((((((0.000124818987 * w -0.001075204047) * w +0.005198775019) * w -0.019198292004) * w +0.059054035642) * w -0.151968751364) * w +0.319152932694) * w -0.531923007300) * w +0.797884560593) * y * 2.0; } else { y -= 2.0; x = (((((((((((((-0.000045255659 * y +0.000152529290) * y -0.000019538132) * y -0.000676904986) * y +0.001390604284) * y -0.000794620820) * y -0.002034254874) * y +0.006549791214) * y -0.010557625006) * y +0.011630447319) * y -0.009279453341) * y +0.005353579108) * y -0.002141268741) * y +0.000535310849) * y +0.999936657524; } } return (z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5)); } <# )