;;
;;
;; Perform some simple randomness tests on a sequence of numerical
;; values.
;;
;; Based on the ENT program by John Walker
;;
;; 
;; Copyright 2007-2012 Ivan Raikov.
;;
;;
;; This program is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or (at
;; your option) any later version.
;;
;; This program is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; A full copy of the GPL license can be found at
;; <http://www.gnu.org/licenses/>."))))
;;
;;

(module random-test
    
  ( make-random-test format-random-stats )

  (import scheme chicken foreign extras data-structures srfi-4)
  
  (require-extension srfi-4)


;;
;;
;; Chi-Square Test
;;
;;    In general, the Chi-Square distribution for an experiment with k
;; possible outcomes, performed n times, in which Y1, Y2,... Yk are
;; the number of experiments which resulted in each possible outcome,
;; and probabilities of each outcome p1, p2,... pk, is given as:
;;
;;   \Chi^2 = \sum_{1 <= i <= k}{(Y_i - m * p_i)^2 / np_i}
;;
;; \Chi^2 will grow larger as the measured results diverge from those
;; expected by pure chance. The probability Q that a Chi-Square value
;; calculated for an experiment with d degrees of freedom (where
;; d=k-1, one less the number of possible outcomes) is due to chance
;; is:
;;
;;   Q(\Chi^2,d) = 
;;     [2^{d/2} * \Gamma(d/2)]^{-1} *
;;     \int_{\Chi^2}^{\infty}(t)^{d/2-1} * exp(-t/2) * dt
;;
;; Where Gamma is the generalisation of the factorial function to real
;; and complex arguments:
;;
;;   Gamma(x) = \int_{0}^{\infty} t^{x-1} * exp(-t) * dt
;;
;;
;; There is no closed form solution for Q, so it must be evaluated
;; numerically.  Note that the probability calculated from the \Chi^2
;; is an approximation which is valid only for large values of n, and
;; is therefore only meaningful when calculated from a large number of
;; independent experiments.
;;
;; In this implementation, the Chi-Square distribution is calculated
;; for the list of values given as argument to the random-test
;; procedure and expressed as an absolute number and a percentage
;; which indicates how frequently a truly random sequence would exceed
;; the value calculated. 
;; 
;; The percentage can be interpreted as the degree to which the
;; sequence tested is suspected of being non-random. If the percentage
;; is greater than 99% or less than 1%, the sequence is almost
;; certainly not random. If the percentage is between 99% and 95% or
;; between 1% and 5%, the sequence is suspect. Percentages between 90%
;; and 95% and 5% and 10% indicate the sequence is "almost
;; suspect". 
;;
;;
;; Arithmetic Mean
;;
;;     This is simply the result of summing the all the values in the
;; sequence and dividing by the sequence length. If the data are close
;; to random, the mean should be about (2^b - 1)/2 where b is the
;; number of bits used to represent a value. If the mean departs from
;; this value, the values are consistently high or low.
;;
;;
;; Monte Carlo Value for Pi
;; 
;;     Each pair of two values in the input sequence is used as X and
;; Y coordinates within a square with side N (the length of the input
;; sequence). If the distance of the randomly-generated point is less
;; than the radius of a circle inscribed within the square, the pair
;; of values is considered a "hit". The percentage of hits can be used
;; to calculate the value of Pi: 
;;
;;
;;    # points within circle     1/4 * pi * r^2 
;;    ----------------------  =  -------------- = 1/4 * pi
;;    # points within square          r^2
;;
;;
;; Therefore,
;;                   # points within circle
;;          pi = 4 * ----------------------
;;                   # points within square
;;
;; 
;;  For very long sequences (this approximation converges very
;;  slowly), the value will approach the correct value of Pi if the
;;  sequence is close to random.
;;
;;
;; Serial Correlation Coefficient
;;
;;     This statistic measures the extent to which each value in the
;; sequence depends upon the previous value. For random sequences,
;; this value (which can be positive or negative) must be close to
;; zero. 
;;
;;


(define pochisq (foreign-lambda double "pochisq" double double))
(define log2 (foreign-lambda double "log2" double))

;; Number of bins for Chi-Square test
(define N 1024)

;; Circle radius for the Monte Carlo calculation of Pi
(define R N)

;; Number of coordinates for the Monte Carlo calculation of Pi
(define M 2)


(define PI 3.14159265358979323846)

(define (make-random-test . rest)
  (let-optionals rest ((scar car) (scdr cdr) (snull? null?))
   (lambda (seq)

     ;; Bins to count occurrences of values 
     (define bins (make-u32vector N 0))

     ;; Monte Carlo coordinates
     (define mcoord (make-u32vector M 0))
     
     (let ((nmin       0.0)   ;; Minimum and maximum values encountered
	   (nmax       0.0)
	   (mp          0)   ;; Monte Carlo accumulator pointer 
	   (mcount      0)   ;; Monte Carlo tries 
	   (incircle    0))  ;; Monte Carlo circle inside count 
       (let* ;; Serial correlation terms 
	   ((sccn0     (and seq (scar seq)))
	    (sccn-1    #f)    
	    (scct1     0.0)   
	    (scct2     0.0)   
	    (scct3     0.0)) 
	 
	 (let loop ((total 0) (seq seq))
	   (if (snull? seq)
	       (random-stats total nmin nmax bins incircle mcount sccn0 scct1 scct2 scct3 sccn-1 )
	       (let ((n  (scar seq)))
		 ;; Updated minimum and maximum
		 ;; Update bin counters
		 (let ((nbin (inexact->exact (modulo n N))))
		   (u32vector-set! bins nbin (fx+ 1 (u32vector-ref bins nbin)))
		   (let ((bi (u32vector-ref bins nbin)))
		     (if (< bi nmin) (set! nmin bi))
		     (if (> bi nmax) (set! nmax bi)))
		   ;; Update inside / outside circle counts for Monte Carlo
		   ;; calculation of Pi 
		   (u32vector-set! mcoord mp nbin)
		   (set! mp (fx+ 1 mp))
		   (if (fx<= M mp)
		      (begin (set! mcount (fx+ mcount 1))
			     (set! mp 0)
			     (let ((x  (u32vector-ref mcoord 0))
				   (y  (u32vector-ref mcoord 1)))
			       ;; determine if the random "coordinates" are
			       ;; inside or outside the circle
			       (let ((dist  (sqrt (+ (expt x 2) (expt y 2)))))
				 (if (<= dist R) (set! incircle (fx+ 1 incircle)))))))
		  ;; Update calculation of serial correlation coefficient 
		   (if sccn-1 (set! scct1 (+ scct1  (* sccn-1 n))))
		   (set! scct2 (+ scct2 n))
		   (set! scct3 (+ scct3 (* n n)))
		   (set! sccn-1 n)
		   (loop (fx+ 1 total) (scdr seq)))))))))))

(define (random-stats total nmin nmax bins incircle mcount sccn0 scct1 scct2 scct3 sccn)

  ;; Probabilities per bin for entropy 
  (define probs (make-f64vector N 0))

  ;; Compute serial correlation coefficient
  (let* ((scct1     (+ scct1 (* sccn sccn0)))
	 (scct2     (* scct2 scct2))
	 (sccdenom  (- (* total scct3) scct2))
	 (scc       (if (zero? sccdenom)
			-10000.0
			(/ (- (* total scct1) scct2) sccdenom))))
	 
    ;; Scan bins and calculate probability for each bin and
    ;; Chi-Square distribution 
    (let ((cexp      (/ total N))) ;; Expected count per bin 
      (let-values (((chisq pochisq datasum) 
	  (let loop ((i 0) (chisq 0.0) (datasum 0.0))
	    (if (fx< i N)
		(let ((bi (u32vector-ref bins i)))
		  (let ((a (- bi cexp)) (p (/ bi total)))
		    (f64vector-set! probs i p) 
		    (loop (fx+ 1 i) 
			  (+ chisq (/ (* a a) cexp))
			  (+ datasum (* i bi)))))
		(values chisq (pochisq chisq (- total 1)) datasum)))))

       ;; Calculate entropy 
       (let ((ent  (let loop ((i 0) (ent 0))
		     (if (fx< i N)
			 (let ((p (f64vector-ref probs i)))
			   (loop (fx+ 1 i) (if (< 0.0 p) (+ ent (* p (log2 (/ 1.0 p)))) ent)))
			 ent))))

	 ;; Calculate Monte Carlo value for Pi from percentage of hits
	 ;; within the circle
	 (let ((montepi (* 4 (/ incircle mcount))))
	   `((total    . ,total)
	     (ent      . ,ent)
	     (chisq    . ,chisq)
	     (pochisq  . ,pochisq)
	     (mean     . ,(/ datasum total))
	     (min      . ,nmin)
	     (max      . ,nmax)
	     (montepi  . ,montepi)
	     (scc      . ,scc))))))))

(define (format-random-stats out stats)
  (fprintf out "Entropy is ~S.\n" (alist-ref 'ent stats))
  (fprintf out "Chi-Square distribution for ~S samples is ~S (~S percent random).\n"
	   (alist-ref 'total stats) 
	   (alist-ref 'chisq stats) 
	   (* 100 (alist-ref 'pochisq stats)))
  (fprintf out "Arithmetic mean value of the sequence is ~S (~S = random).\n"
	   (alist-ref 'mean stats)
	   (/ (alist-ref 'total stats) N))
  (fprintf out "Monte-Carlo value for Pi is ~S (error ~S percent).\n"
	   (alist-ref 'montepi stats)
	   (* 100 (/ (abs (- PI (alist-ref 'montepi stats))) PI)))
  (fprintf out "Serial correlation coefficient is ~S.\n"
	   (let ((scc (alist-ref 'scc stats)))
	     (if (>= scc -99999) scc "undefined (all values equal)"))))


; Include into generated code, but don't parse:
#>

#include <math.h>
double poz  (double z);

#define	LOG_SQRT_PI     0.5723649429247000870717135 /* log (sqrt (pi)) */
#define	I_SQRT_PI       0.5641895835477562869480795 /* 1 / sqrt (pi) */
#define	BIGX           20.0         /* max value to represent exp (x) */
#define	ex(x)          (((x) < -BIGX) ? 0.0 : exp (x))


/* ALGORITHM Compute probability of chi square value.
;;	Adapted from:
;;		Hill, I. D. and Pike, M. C.  Algorithm 299
;;		Collected Algorithms for the CACM 1967 p. 243
;;	Updated for rounding errors based on remark in
;;		ACM TOMS June 1985, page 185
*/

/* 
;;  x: obtained chi-square value 
;;  df: degrees of freedom 
*/

double pochisq (double x, double df)
{

   double a, y, s;
   double e, c, z;
   int 	even;     /* true if df is an even number */
	
   if (x <= 0.0 || df < 1)
     return (1.0);
	
     a = 0.5 * x;
     even = (2*(df/2)) == df;
     if (df > 1)
     y = ex (-a);
     s = (even ? y : (2.0 * poz (-sqrt (x))));
     if (df > 2)
     {
      x = 0.5 * (df - 1.0);
	z = (even ? 1.0 : 0.5);
	if (a > BIGX)
	{
	 e = (even ? 0.0 : LOG_SQRT_PI);
	   c = log (a);
	   while (z <= x)
	   {
	    e = log (z) + e;
	      s += ex (c*z-a-e);
	      z += 1.0;
	      }
	   return (s);
	   }
	else
	{
	 e = (even ? 1.0 : (I_SQRT_PI / sqrt (a)));
	   c = 0.0;
	   while (z <= x)
	   {
	    e = e * (a / z);
	      c = c + e;
	      z += 1.0;
	      }
	   return (c * y + s);
	   }
	}
     else
     return (s);
}

/* Maximum meaningful z value */
#define Z_MAX 6.0    


/*
;; FUNCTION poz: probability of normal z value 
;;
;;	Adapted from a polynomial approximation in:
;;		Ibbetson D, Algorithm 209
;;		Collected Algorithms of the CACM 1963 p. 616
;;	Note:
;;
;;		This routine has six digit accuracy, so it is only
;;		useful for absolute z values < 6.  For z values >= to
;;		6.0, poz() returns 0.0.
;;
*/
double poz (double z)
{

 double	y, x, w;
	
 if (z == 0.0)
    x = 0.0;
 else
 {
    y = 0.5 * fabs (z);
    if (y >= (Z_MAX * 0.5))
    {
       x = 1.0;
    }
    else if (y < 1.0)
         {
	    w = y * y;
	    x = ((((((((0.000124818987 * w
		       -0.001075204047) * w +0.005198775019) * w
		       -0.019198292004) * w +0.059054035642) * w
		       -0.151968751364) * w +0.319152932694) * w
		       -0.531923007300) * w +0.797884560593) * y * 2.0;
	  }
 	  else
	  {
	     y -= 2.0;
	       x = (((((((((((((-0.000045255659 * y
				+0.000152529290) * y -0.000019538132) * y
				-0.000676904986) * y +0.001390604284) * y
				-0.000794620820) * y -0.002034254874) * y
				+0.006549791214) * y -0.010557625006) * y
				+0.011630447319) * y -0.009279453341) * y
				+0.005353579108) * y -0.002141268741) * y
				+0.000535310849) * y +0.999936657524;
	  }
 }

 return (z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5));
}

<#
)