;;;;

;;Issues
;;
;;- see the haskell benchmark suite (? name ?)
;;
;;- missing `nice` argument checking

(module micro-stats

(;export
  ;
  statistics-sets statistics-set
  basic-statistics
  generate-statistics generate-statistics-alist
  ;
  standard-deviation*
  variance standard-deviation absolute-deviation
  mean*
  arithmetic-mean harmonic-mean geometric-mean
  mean median mode
  quantile percentile interquartile-range five-number-summary
  ;#| ;FIXME need to validate
  kurtosis skewness covariance correlation
  chi-square
  ;|#
  )

(import scheme
  (chicken base)
  (chicken type)
  (chicken syntax)
  (chicken fixnum)
  (chicken flonum)
  (chicken foreign)
  (only (chicken format) format)
  (only (srfi 1) first list-copy every fold)
  (prefix sequences seq:)
  (prefix (sequences utils misc) seq:)
  (prefix (sequences utils sort) seq:)
  (prefix (sequences utils sample) seq:))

(include-relative "micro-stats-types")

(define-type basic-stats-vector (vector real real real real real))

(: statistics-sets (#!optional alist -> alist))
(: statistics-set (stats-set-id -> alist))
(: basic-statistics (seq -> basic-stats-vector))

(: generate-statistics (seq #!optional (or boolean symbol) -> (or false stats-alist)))
(: generate-statistics-alist (seq (or false list) #!optional basic-stats-vector -> (or false stats-alist)))

(: variance (seq #!optional (or boolean integer) -> real))
(: standard-deviation* (seq #!optional (or boolean integer) -> real real))
(: standard-deviation (seq #!optional (or boolean integer) -> real))
(: absolute-deviation (seq -> real))

(: mean* (seq -> real real real))
(: arithmetic-mean (seq -> real))
(: harmonic-mean (seq -> real))
(: geometric-mean (seq -> real))
(: mean (seq -> real))
(: mode (seq -> real))
(: median (seq -> real))

(: quantile (seq real -> real))
(: percentile (seq #!optional real -> real))
(: interquartile-range (seq -> real))
(: five-number-summary (seq -> real real real real real))

(: kurtosis (seq #!optional (or boolean integer) -> real))
(: skewness (seq #!optional (or boolean integer) -> real))

(: covariance (seq seq #!optional (or boolean integer) -> real))
(: correlation (seq seq #!optional (or boolean integer) -> real))

(: chi-square (seq (or real seq) #!optional boolean -> real))

;;;

;from miscmacros

(define-syntax define-parameter
  (syntax-rules ()
    ((define-parameter name value guard)
     (define name (make-parameter value guard)))
    ((define-parameter name value)
     (define name (make-parameter value)))
    ((define-parameter name)
     (define name (make-parameter (void))))))

;from moremacros

(import-for-syntax (only (chicken base) symbol-append))

(define-syntax checked-guard
  (er-macro-transformer
    (lambda (frm rnm cmp)
      (##sys#check-syntax 'checked-guard frm '(_ symbol symbol . _))
      (let ((_lambda (rnm 'lambda))
            (_let (rnm 'let))
            (_arg (rnm 'arg))
            (?locnam (cadr frm))
            (?typnam (caddr frm))
            (?body (cdddr frm)) )
        (let ((chknam (symbol-append 'check- (strip-syntax ?typnam)))) ;inject
          `(,_lambda (,_arg)
            (,chknam ',?locnam ,_arg)
            (,_let ((obj ,_arg))
              ,@?body
              obj ) ) ) ) ) ) )

(define-syntax define-checked-parameter
  (syntax-rules ()
    ((define-checked-parameter ?name ?init ?typnam ?body0 ...)
      (define-parameter ?name ?init (checked-guard ?name ?typnam ?body0 ...)) ) ) )

;from check-errors

(define (make-bad-argument-type-message #!optional argnam)
  (if (not argnam) "bad argument type"
      (format #f "bad `~A' argument type" argnam)) )

(define (make-type-name-tobe-message typnam)
  (format #f "not a ~A" typnam) )

(define (make-error-type-message typnam #!optional argnam)
  (string-append (make-bad-argument-type-message argnam)
                 " - "
                 (make-type-name-tobe-message typnam)) )

(define (alist? obj) (and (list? obj) (every pair? obj)))

(define (error-alist loc obj #!optional argnam)
  (import (only (chicken string) conc))
  (##sys#signal-hook #:type-error loc (make-error-type-message "alist" argnam) obj) )

(define (check-alist loc obj . args)
  (unless (alist? obj)
    (error-alist loc obj (let ((tmp args)) (if (null? tmp) #f (car tmp)))) )
  obj )

;;;

;;

;let actual float in data taint the result
(define VERY-SMALL-EXACT (inexact->exact minimum-flonum))
(define VERY-LARGE-EXACT (inexact->exact maximum-flonum))

(define NORMAL-STATISTICS '(
  size
  max min
  arithmetic-mean (mean . (arithmetic-mean))
  standard-deviation (sd . (standard-deviation)) (sigma . (standard-deviation))
))

(define VERBOSE-STATISTICS (append NORMAL-STATISTICS '(
  (95th . (percentile 95))
  harmonic-mean geometric-mean
  median mode
  variance (var . (variance)) (sigma^2 . (variance))
)))

(define FULL-STATISTICS (append VERBOSE-STATISTICS '(
  kurtosis
  skewness
  interquartile-range
  five-number-summary
)))

(define INITIAL-STATISTICS-SETS `(
  (normal . ,NORMAL-STATISTICS)
  (verbose . ,VERBOSE-STATISTICS)
  (full . ,FULL-STATISTICS)
))

(define-checked-parameter statistics-sets INITIAL-STATISTICS-SETS alist)

(define (checked-statistics-set-id->symbol obj)
  (case obj
    ((#f) 'normal)
    ((#t) 'verbose)
    (else
      (if (memq obj '(normal verbose full)) obj
          (error 'statistics-set "not a valid stat-set-id" obj)))) )

(define (statistics-set stat-set-id)
  (let ((key (checked-statistics-set-id->symbol stat-set-id)))
     (alist-ref key (statistics-sets) eq?) ) )

(define-constant ARITHMETIC-IDX 0)
(define-constant HARMONIC-IDX 1)
(define-constant GEOMETRIC-IDX 2)

(define-constant SIGMA-IDX 3)
(define-constant SIGMA^2-IDX 4)

(define (normal-stats runs)
  (vector (arithmetic-mean runs) +nan.0 +nan.0 (standard-deviation runs) +nan.0) )

(define (verbose-stats runs)
  (let*-values (((m h g) (mean* runs))
                ((s v) (standard-deviation* runs m)) )
    (vector m h g s v) ) )

(define (needs-verbose-stats?  funcs)
  (define (set-has-verbose-item! flags func)
    (let* ((parameterized? (pair? func))
           (func-key (if parameterized? (car func) func))
           (func-name (if parameterized? (cadr func) func-key)) )
      (case func-name
        ((harmonic-mean)  (vector-set! flags 0 #t))
        ((geometric-mean) (vector-set! flags 1 #t))
        ((variance)       (vector-set! flags 2 #t)) )
      flags ) )
  (let ((flags (foldl set-has-verbose-item! (vector #f #f #f) funcs)))
    (and (vector-ref flags 0)
         (vector-ref flags 1)
         (vector-ref flags 2)) ) )

(define (basic-stats runs funcs)
  ((if (needs-verbose-stats? funcs) verbose-stats normal-stats) runs) )

(define (gen-stats-item runs func statvec)
  (let* ((parameterized? (pair? func))
         (func-key (if parameterized? (car func) func))
         (func-name (if parameterized? (cadr func) func-key))
         (func-params (if parameterized? (cddr func) '())) )
    `(,func-key
      .
      ,(case func-name
        ((size)
          (seq:size runs) )
        ((max)
          (seq:foldl max VERY-SMALL-EXACT runs) )
        ((min)
          (seq:foldl min VERY-LARGE-EXACT runs) )
        ((arithmetic-mean)
          (vector-ref statvec ARITHMETIC-IDX) )
        ((standard-deviation)
          (vector-ref statvec SIGMA-IDX) )
        ((percentile)
          (apply percentile runs func-params) )
        ((harmonic-mean)
          (vector-ref statvec HARMONIC-IDX) )
        ((geometric-mean)
          (vector-ref statvec GEOMETRIC-IDX) )
        ((median)
          (median runs) )
        ((mode)
          (mode runs) )
        ((variance)
          (vector-ref statvec SIGMA^2-IDX) )
        (else
          (error 'statistic "unknown function" func-name)))) ) )

;

(define basic-statistics verbose-stats)

(define (generate-statistics runs #!optional stat-set-id)
  (generate-statistics-alist runs (statistics-set stat-set-id)) )

(define (generate-statistics-alist runs funcs #!optional statvec)
  (and (list? funcs)
       (map (cut gen-stats-item runs <> (or statvec (basic-stats runs funcs)))
            funcs) ) )

;;

(define (acc-update! acc i f) (set! (vector-ref acc i) (f (vector-ref acc i))))

(define (geometric-accum n)
  (let ((r (/ 1 n)))
    (lambda (acc run)
      (let ((x (expt run r)))
        (* acc (if (exact? run) (inexact->exact x) x)) ) ) ) )

(define ((harmonic-accum n) acc run)
  (if (zero? run) acc
      (+ acc (/ 1 run))) )

(define ((arithmetic-accum n) acc run)
  (+ acc (/ run n)) )

;seq => arithmetic harmonic geometric
(define (mean* runs)
  (let ((n (seq:size runs)))
    (if (fx= 0 n) (values +nan.0 +nan.0 +nan.0)
        (let* ((accums (vector 0 0 1))
               (arit-acc (arithmetic-accum n))
               (harm-acc (harmonic-accum n))
               (geom-acc (geometric-accum n))
               (accum!
                 (lambda (accs run)
                   ;minimize the acccumulated value
                   (acc-update! accs ARITHMETIC-IDX (cut arit-acc <> run))
                   (acc-update! accs HARMONIC-IDX (cut harm-acc <> run))
                   (acc-update! accs GEOMETRIC-IDX (cut geom-acc <> run))
                   accs ) )
               (accums (seq:foldl accum! accums runs)) )
            (values (vector-ref accums ARITHMETIC-IDX)
                    (let ((h (vector-ref accums HARMONIC-IDX)))
                      (if (zero? h) h (/ n h)))
                    (vector-ref accums GEOMETRIC-IDX) ) ) ) ) )

;Returns the arithmetic mean of a set of numbers @runs. @runs is a
;sequence of numeric values.
(define (arithmetic-mean runs)
  (seq:foldl (arithmetic-accum (seq:size runs)) 0 runs) )

(define (harmonic-mean runs)
  (let* ((n (seq:size runs))
         (h (seq:foldl (harmonic-accum n) 0 runs)) )
    (if (zero? h) h (/ n h)) ) )

(define (geometric-mean runs)
  (seq:foldl (geometric-accum (seq:size runs)) 1 runs) )

(define mean arithmetic-mean)

;Returns the median for a set of numbers @runs.
(define (median runs)
  (if (seq:empty? runs) +nan.0
      (let* ((ordered (seq:sort runs <))
             (mid (- (/ (fx+ (seq:size ordered) 1) 2) 1)) )
        (if (integer? mid) (seq:elt ordered mid))
            (/ (+ (seq:elt ordered (floor mid)) (seq:elt ordered (ceiling mid))) 2) ) ) )

;Returns the mode of a set of numbers @runs. The mode is the value that
;appears most often in @runs. @runs is a sequence of numeric values.
;{=} is used as the equality operator.
(define (mode runs)
  (define (occur> a b) (> (cdr a) (cdr b)))
  (if (seq:empty? runs) +nan.0
      (let* ((srtd (seq:sort! (seq:histogram runs =) occur>))
             (top-bin (seq:elt srtd 0))
             (btm-bin (seq:elt srtd (fx- (seq:size srtd) 1))) )
        ;when no distinction then no mode: go w/ the bottom
        (if (fx= (cdr top-bin) (cdr btm-bin)) (car btm-bin)
            (car top-bin)) ) ) )
;;

(define (*quantile runs p)
  (if (seq:empty? runs) +nan.0
      ;ceiling np must be valid index, so last index, not size
      (let ((np (* p (fx- (seq:size runs) 1)))
            (ordered (seq:sort runs <)) )
        (arithmetic-mean
          (seq:sequence runs (seq:elt ordered (inexact->exact (floor np)))
                             (seq:elt ordered (inexact->exact (ceiling np))))) ) ) )

;Returns the p-quantile for a set of numbers @runs (also known as the
;"inverse cumulative distribution""). @p is a real number between 0 and
;1.0. For instance, the 0.5-quantile corresponds to the median.
(define (quantile runs p)
  (assert (and (real? p) (<= 0 p 1)) 'quantile "not in [0 .. 1]" p)
  (*quantile runs p) )

;Returns the percentile for a set of numbers @runs and a given percentage
;@pct. @pct is a number between 0 and 100. For instance, the 90th
;percentile corresponds to the 0.9-quantile.
(define (percentile runs #!optional (pct 95))
  (assert (and (real? pct) (<= 0 pct 100)) 'percentile "not in [0 .. 100]" pct)
  (*quantile runs (/ pct 100)) )

;Returns the interquartile range for a given set of numbers @runs. @runs
;is a proper list of numeric values. The interquartile range is the
;difference between the 0.75-quantile and the 0.25-quantile.
(define (interquartile-range runs) (- (*quantile runs 3/4) (*quantile runs 1/4)))

;Returns a list of 5 statistics describing the set of numbers @runs: the
;minimum value, the lower quartile, the median, the upper quartile, and
;the maximum value.
(define (five-number-summary runs)
  (values (*quantile runs 0)
          (*quantile runs 1/4) (*quantile runs 1/2) (*quantile runs 3/4)
          (*quantile runs 1)) )

;;

;NOTE About Data Input Form
;
;All @runs* are a {sequence} of numbers.

;NOTE About Bias Correction
;
;Bias correction gets disabled by setting @bias to {#f}. Alternatively,
;it is possible to provide a positive integer, which is used instead of
;the number of elements.

;Returns the variance for a set of numbers @runs, optionally modifying
;the bias correction.
(define (variance runs #!optional (bias #t))
  (let ((siz (seq:size runs)))
    (if (<= 0 siz 1) 0
        (let* ((m (arithmetic-mean runs))
               (acc (seq:foldl (lambda (acc elm) (+ acc (expt (- elm m) 2))) 0 runs)) )
          (/ acc
             (if (integer? bias) bias
                 (- siz (if bias 1 0)))) ) ) ) )

;seq => sd var
(define (standard-deviation* runs #!optional (bias #t))
  (let* ((var (variance runs bias))
         (sd (sqrt var)) )
    (values (if (exact? var) (inexact->exact sd) sd) var)) )

;Returns the standard deviation for a set of numbers @runs, optionally
;modifying the bias correction.
(define (standard-deviation runs #!optional (bias #t))
  (receive (sd var) (standard-deviation* runs bias) sd) )

;Returns the average absolute difference between the set of numbers
;@runs and it's median.
(define (absolute-deviation runs)
  (let ((siz (seq:size runs)))
    (if (zero? siz) +nan.0
        (let ((m (arithmetic-mean runs)))
          (max 0 (/ (seq:foldl (lambda (acc elm) (+ acc (abs (- elm m)))) runs) siz)))) ) )

;;

;Returns the skewness for a set of numbers @runs, optionally modifying
;the bias correction.
(define (skewness runs #!optional (bias #t))
  (let ((n (seq:size runs))
        (m (arithmetic-mean runs))
        (s (standard-deviation runs bias)) )
    (* (/ (seq:foldl (lambda (z x) (+ z (/ (expt (- x m) 3) n))) 0 runs)
          (expt s 3))
       (if bias
           (let ((n (if (integer? bias) bias n)))
             (/ (sqrt (* n (- n 1))) (- n 2)))
           1)) ) )

;Returns the kurtosis for a set of numbers @runs, optionally modifying
;the bias correction.
(define (kurtosis runs #!optional (bias #t))
  (let ((n (seq:size runs))
        (m (arithmetic-mean runs))
        (s (standard-deviation runs bias)))
    (/ (seq:foldl (lambda (z x) (+ z (expt (- x m) 4))) 0 runs)
       (* (expt s 4)
          (if (integer? bias)
              bias
              (if bias (- n 1) n)))) ) )

;;

;Returns the covariance of two sets of numbers @runs1 and @runs2,
;optionally modifying the bias correction.
(define (covariance runs1 runs2 #!optional (bias #t))
  (let ((n (seq:size runs1)))
    (assert (= n (seq:size runs2))
            'covariance "runs must be same size" n (seq:size runs2))
    (let ((xm (arithmetic-mean runs1))
          (ym (arithmetic-mean runs2)))
      ;FIXME need 2 seq foldl, coercion expensive
      (/ (fold (lambda (x y z) (+ z (* (- x xm) (- y ym))))
               0 (seq:->list runs1) (seq:->list runs2))
         (if (integer? bias)
             bias
             (- n (if bias 1 0)))) ) ) )

;Returns the correlation of two sets of numbers, @runs1 and @runs2, in
;the form of the Pearson product-moment correlation coefficient,
;optionally modifying the bias correction.
(define (correlation runs1 runs2 #!optional (bias #t))
  (/ (covariance runs1 runs2 bias)
     (* (standard-deviation runs1 bias) (standard-deviation runs2 bias))) )

;;

(define (yates-chi-square-component o e)
  (let ((r (- (abs (- o e)) 1/2)))
    (/ (* r r) e) ) )

(define (pearson-chi-square-component o e)
  (let ((r (- o e)))
    (/ (* r r) e) ) )

(define (chi-components component observed expected)
  (define (xpct-elt-comp s i)
    (component (seq:elt s i) (seq:elt expected (seq:index i))) )
  (if (seq:sequence? expected)
      (seq:smap* #() xpct-elt-comp observed)
      (seq:smap #() (cut component <> expected) observed) ) )

;

;Returns the chi-square of two sets of numbers, @observed and @expected,
;using the Pearson component. The Yates's correction component gets used
;by setting @yates? to {#t}.
(define (chi-square observed expected #!optional yates?)
  ;
  (assert (seq:sequence? observed)
          'chi-square "observed not a sequence" observed expected yates?)
  (assert (or (seq:sequence? expected) (real? expected))
          'chi-square "expected not a sequence or real" observed expected yates?)
  (when (and (seq:sequence? observed) (seq:sequence? expected))
    (assert (= (seq:size expected) (seq:size observed))
            'chi-square "length observed not same length expected" observed expected yates?) )
  ;
  (let ((component (if yates? yates-chi-square-component pearson-chi-square-component)))
    (seq:reduce + 0 (chi-components component observed expected)) ) )

) ;module micro-stats