;;
;;
;; Perform some simple randomness tests on a sequence of numerical
;; values.
;;
;; Based on the ENT program by John Walker
;;
;;
;; Copyright 2007-2012 Ivan Raikov.
;;
;;
;; This program is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or (at
;; your option) any later version.
;;
;; This program is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; A full copy of the GPL license can be found at
;; ."))))
;;
;;
(module random-test
( make-random-test format-random-stats )
(import scheme chicken foreign extras data-structures srfi-4)
(require-extension srfi-4)
;;
;;
;; Chi-Square Test
;;
;; In general, the Chi-Square distribution for an experiment with k
;; possible outcomes, performed n times, in which Y1, Y2,... Yk are
;; the number of experiments which resulted in each possible outcome,
;; and probabilities of each outcome p1, p2,... pk, is given as:
;;
;; \Chi^2 = \sum_{1 <= i <= k}{(Y_i - m * p_i)^2 / np_i}
;;
;; \Chi^2 will grow larger as the measured results diverge from those
;; expected by pure chance. The probability Q that a Chi-Square value
;; calculated for an experiment with d degrees of freedom (where
;; d=k-1, one less the number of possible outcomes) is due to chance
;; is:
;;
;; Q(\Chi^2,d) =
;; [2^{d/2} * \Gamma(d/2)]^{-1} *
;; \int_{\Chi^2}^{\infty}(t)^{d/2-1} * exp(-t/2) * dt
;;
;; Where Gamma is the generalisation of the factorial function to real
;; and complex arguments:
;;
;; Gamma(x) = \int_{0}^{\infty} t^{x-1} * exp(-t) * dt
;;
;;
;; There is no closed form solution for Q, so it must be evaluated
;; numerically. Note that the probability calculated from the \Chi^2
;; is an approximation which is valid only for large values of n, and
;; is therefore only meaningful when calculated from a large number of
;; independent experiments.
;;
;; In this implementation, the Chi-Square distribution is calculated
;; for the list of values given as argument to the random-test
;; procedure and expressed as an absolute number and a percentage
;; which indicates how frequently a truly random sequence would exceed
;; the value calculated.
;;
;; The percentage can be interpreted as the degree to which the
;; sequence tested is suspected of being non-random. If the percentage
;; is greater than 99% or less than 1%, the sequence is almost
;; certainly not random. If the percentage is between 99% and 95% or
;; between 1% and 5%, the sequence is suspect. Percentages between 90%
;; and 95% and 5% and 10% indicate the sequence is "almost
;; suspect".
;;
;;
;; Arithmetic Mean
;;
;; This is simply the result of summing the all the values in the
;; sequence and dividing by the sequence length. If the data are close
;; to random, the mean should be about (2^b - 1)/2 where b is the
;; number of bits used to represent a value. If the mean departs from
;; this value, the values are consistently high or low.
;;
;;
;; Monte Carlo Value for Pi
;;
;; Each pair of two values in the input sequence is used as X and
;; Y coordinates within a square with side N (the length of the input
;; sequence). If the distance of the randomly-generated point is less
;; than the radius of a circle inscribed within the square, the pair
;; of values is considered a "hit". The percentage of hits can be used
;; to calculate the value of Pi:
;;
;;
;; # points within circle 1/4 * pi * r^2
;; ---------------------- = -------------- = 1/4 * pi
;; # points within square r^2
;;
;;
;; Therefore,
;; # points within circle
;; pi = 4 * ----------------------
;; # points within square
;;
;;
;; For very long sequences (this approximation converges very
;; slowly), the value will approach the correct value of Pi if the
;; sequence is close to random.
;;
;;
;; Serial Correlation Coefficient
;;
;; This statistic measures the extent to which each value in the
;; sequence depends upon the previous value. For random sequences,
;; this value (which can be positive or negative) must be close to
;; zero.
;;
;;
(define pochisq (foreign-lambda double "pochisq" double double))
(define log2 (foreign-lambda double "log2" double))
;; Number of bins for Chi-Square test
(define N 1024)
;; Circle radius for the Monte Carlo calculation of Pi
(define R N)
;; Number of coordinates for the Monte Carlo calculation of Pi
(define M 2)
(define PI 3.14159265358979323846)
(define (make-random-test . rest)
(let-optionals rest ((scar car) (scdr cdr) (snull? null?))
(lambda (seq)
;; Bins to count occurrences of values
(define bins (make-u32vector N 0))
;; Monte Carlo coordinates
(define mcoord (make-u32vector M 0))
(let ((nmin 0.0) ;; Minimum and maximum values encountered
(nmax 0.0)
(mp 0) ;; Monte Carlo accumulator pointer
(mcount 0) ;; Monte Carlo tries
(incircle 0)) ;; Monte Carlo circle inside count
(let* ;; Serial correlation terms
((sccn0 (and seq (scar seq)))
(sccn-1 #f)
(scct1 0.0)
(scct2 0.0)
(scct3 0.0))
(let loop ((total 0) (seq seq))
(if (snull? seq)
(random-stats total nmin nmax bins incircle mcount sccn0 scct1 scct2 scct3 sccn-1 )
(let ((n (scar seq)))
;; Updated minimum and maximum
;; Update bin counters
(let ((nbin (inexact->exact (modulo n N))))
(u32vector-set! bins nbin (fx+ 1 (u32vector-ref bins nbin)))
(let ((bi (u32vector-ref bins nbin)))
(if (< bi nmin) (set! nmin bi))
(if (> bi nmax) (set! nmax bi)))
;; Update inside / outside circle counts for Monte Carlo
;; calculation of Pi
(u32vector-set! mcoord mp nbin)
(set! mp (fx+ 1 mp))
(if (fx<= M mp)
(begin (set! mcount (fx+ mcount 1))
(set! mp 0)
(let ((x (u32vector-ref mcoord 0))
(y (u32vector-ref mcoord 1)))
;; determine if the random "coordinates" are
;; inside or outside the circle
(let ((dist (sqrt (+ (expt x 2) (expt y 2)))))
(if (<= dist R) (set! incircle (fx+ 1 incircle)))))))
;; Update calculation of serial correlation coefficient
(if sccn-1 (set! scct1 (+ scct1 (* sccn-1 n))))
(set! scct2 (+ scct2 n))
(set! scct3 (+ scct3 (* n n)))
(set! sccn-1 n)
(loop (fx+ 1 total) (scdr seq)))))))))))
(define (random-stats total nmin nmax bins incircle mcount sccn0 scct1 scct2 scct3 sccn)
;; Probabilities per bin for entropy
(define probs (make-f64vector N 0))
;; Compute serial correlation coefficient
(let* ((scct1 (+ scct1 (* sccn sccn0)))
(scct2 (* scct2 scct2))
(sccdenom (- (* total scct3) scct2))
(scc (if (zero? sccdenom)
-10000.0
(/ (- (* total scct1) scct2) sccdenom))))
;; Scan bins and calculate probability for each bin and
;; Chi-Square distribution
(let ((cexp (/ total N))) ;; Expected count per bin
(let-values (((chisq pochisq datasum)
(let loop ((i 0) (chisq 0.0) (datasum 0.0))
(if (fx< i N)
(let ((bi (u32vector-ref bins i)))
(let ((a (- bi cexp)) (p (/ bi total)))
(f64vector-set! probs i p)
(loop (fx+ 1 i)
(+ chisq (/ (* a a) cexp))
(+ datasum (* i bi)))))
(values chisq (pochisq chisq (- total 1)) datasum)))))
;; Calculate entropy
(let ((ent (let loop ((i 0) (ent 0))
(if (fx< i N)
(let ((p (f64vector-ref probs i)))
(loop (fx+ 1 i) (if (< 0.0 p) (+ ent (* p (log2 (/ 1.0 p)))) ent)))
ent))))
;; Calculate Monte Carlo value for Pi from percentage of hits
;; within the circle
(let ((montepi (* 4 (/ incircle mcount))))
`((total . ,total)
(ent . ,ent)
(chisq . ,chisq)
(pochisq . ,pochisq)
(mean . ,(/ datasum total))
(min . ,nmin)
(max . ,nmax)
(montepi . ,montepi)
(scc . ,scc))))))))
(define (format-random-stats out stats)
(fprintf out "Entropy is ~S.\n" (alist-ref 'ent stats))
(fprintf out "Chi-Square distribution for ~S samples is ~S (~S percent random).\n"
(alist-ref 'total stats)
(alist-ref 'chisq stats)
(* 100 (alist-ref 'pochisq stats)))
(fprintf out "Arithmetic mean value of the sequence is ~S (~S = random).\n"
(alist-ref 'mean stats)
(/ (alist-ref 'total stats) N))
(fprintf out "Monte-Carlo value for Pi is ~S (error ~S percent).\n"
(alist-ref 'montepi stats)
(* 100 (/ (abs (- PI (alist-ref 'montepi stats))) PI)))
(fprintf out "Serial correlation coefficient is ~S.\n"
(let ((scc (alist-ref 'scc stats)))
(if (>= scc -99999) scc "undefined (all values equal)"))))
; Include into generated code, but don't parse:
#>
#include
double poz (double z);
#define LOG_SQRT_PI 0.5723649429247000870717135 /* log (sqrt (pi)) */
#define I_SQRT_PI 0.5641895835477562869480795 /* 1 / sqrt (pi) */
#define BIGX 20.0 /* max value to represent exp (x) */
#define ex(x) (((x) < -BIGX) ? 0.0 : exp (x))
/* ALGORITHM Compute probability of chi square value.
;; Adapted from:
;; Hill, I. D. and Pike, M. C. Algorithm 299
;; Collected Algorithms for the CACM 1967 p. 243
;; Updated for rounding errors based on remark in
;; ACM TOMS June 1985, page 185
*/
/*
;; x: obtained chi-square value
;; df: degrees of freedom
*/
double pochisq (double x, double df)
{
double a, y, s;
double e, c, z;
int even; /* true if df is an even number */
if (x <= 0.0 || df < 1)
return (1.0);
a = 0.5 * x;
even = (2*(df/2)) == df;
if (df > 1)
y = ex (-a);
s = (even ? y : (2.0 * poz (-sqrt (x))));
if (df > 2)
{
x = 0.5 * (df - 1.0);
z = (even ? 1.0 : 0.5);
if (a > BIGX)
{
e = (even ? 0.0 : LOG_SQRT_PI);
c = log (a);
while (z <= x)
{
e = log (z) + e;
s += ex (c*z-a-e);
z += 1.0;
}
return (s);
}
else
{
e = (even ? 1.0 : (I_SQRT_PI / sqrt (a)));
c = 0.0;
while (z <= x)
{
e = e * (a / z);
c = c + e;
z += 1.0;
}
return (c * y + s);
}
}
else
return (s);
}
/* Maximum meaningful z value */
#define Z_MAX 6.0
/*
;; FUNCTION poz: probability of normal z value
;;
;; Adapted from a polynomial approximation in:
;; Ibbetson D, Algorithm 209
;; Collected Algorithms of the CACM 1963 p. 616
;; Note:
;;
;; This routine has six digit accuracy, so it is only
;; useful for absolute z values < 6. For z values >= to
;; 6.0, poz() returns 0.0.
;;
*/
double poz (double z)
{
double y, x, w;
if (z == 0.0)
x = 0.0;
else
{
y = 0.5 * fabs (z);
if (y >= (Z_MAX * 0.5))
{
x = 1.0;
}
else if (y < 1.0)
{
w = y * y;
x = ((((((((0.000124818987 * w
-0.001075204047) * w +0.005198775019) * w
-0.019198292004) * w +0.059054035642) * w
-0.151968751364) * w +0.319152932694) * w
-0.531923007300) * w +0.797884560593) * y * 2.0;
}
else
{
y -= 2.0;
x = (((((((((((((-0.000045255659 * y
+0.000152529290) * y -0.000019538132) * y
-0.000676904986) * y +0.001390604284) * y
-0.000794620820) * y -0.002034254874) * y
+0.006549791214) * y -0.010557625006) * y
+0.011630447319) * y -0.009279453341) * y
+0.005353579108) * y -0.002141268741) * y
+0.000535310849) * y +0.999936657524;
}
}
return (z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5));
}
<#
)