;;;;

;;Issues
;;
;;- see the haskell benchmark suite (? name ?)
;;
;;- missing `nice` argument checking

(module micro-stats

(;export
  statistics-sets statistics-set
  basic-statistics
  generate-statistics generate-statistics-alist
  variance standard-deviation absolute-deviation
  arithmetic-mean mean median mode
  quantile percentile interquartile-range five-number-summary
  kurtosis skewness covariance correlation
  ;#; ;need to validate
  chi-square)

(import scheme
  (chicken base)
  (chicken type)
  (chicken syntax)
  (chicken fixnum)
  (chicken flonum)
  (chicken foreign)
  (only (chicken format) format)
  (only (srfi 1) first list-copy every fold)
  (prefix sequences seq:)
  (prefix (sequences utils misc) seq:)
  (prefix (sequences utils sort) seq:)
  (prefix (sequences utils sample) seq:))

(include-relative "micro-stats-types")

(: statistics-sets (#!optional alist -> alist))
(: statistics-set (stats-set-id -> alist))
(: basic-statistics (seq -> (vector-of real)))

(: generate-statistics (seq #!optional (or boolean symbol) -> (or false stats-alist)))
(: generate-statistics-alist (seq (or false list) #!optional (vector-of real) -> (or false stats-alist)))
(: generate-statistics-sublist (seq list #!optional vector -> (pair symbol *)))

(: variance (seq #!optional (or boolean real) -> real))
(: standard-deviation (seq #!optional (or boolean real) -> real real))
(: stddev (seq #!optional (or boolean real) -> real))
(: absolute-deviation (seq -> real))

(: arithmetic-mean (seq -> real))
(: mean (seq -> real real real))
(: mode (seq -> real))
(: median (seq -> real))

(: quantile (seq real -> real))
(: percentile (seq #!optional real -> real))
(: interquartile-range (seq -> real))
(: five-number-summary (seq -> real real real real real))

(: kurtosis (seq #!optional (or boolean real) -> real))
(: skewness (seq #!optional (or boolean real) -> real))

(: covariance (seq seq #!optional (or boolean real) -> real))
(: correlation (seq seq #!optional (or boolean real) -> real))

(: chi-square (seq (or real seq) #!optional boolean -> real))

;;;

;from miscmacros

(define-syntax define-parameter
  (syntax-rules ()
    ((define-parameter name value guard)
     (define name (make-parameter value guard)))
    ((define-parameter name value)
     (define name (make-parameter value)))
    ((define-parameter name)
     (define name (make-parameter (void))))))

;from moremacros

(import-for-syntax (only (chicken base) symbol-append))

(define-syntax checked-guard
  (er-macro-transformer
    (lambda (frm rnm cmp)
      (##sys#check-syntax 'checked-guard frm '(_ symbol symbol . _))
      (let ((_lambda (rnm 'lambda))
            (_let (rnm 'let))
            (_arg (rnm 'arg))
            (?locnam (cadr frm))
            (?typnam (caddr frm))
            (?body (cdddr frm)) )
        (let ((chknam (symbol-append 'check- (strip-syntax ?typnam)))) ;inject
          `(,_lambda (,_arg)
            (,chknam ',?locnam ,_arg)
            (,_let ((obj ,_arg))
              ,@?body
              obj ) ) ) ) ) ) )

(define-syntax define-checked-parameter
  (syntax-rules ()
    ((define-checked-parameter ?name ?init ?typnam ?body0 ...)
      (define-parameter ?name ?init (checked-guard ?name ?typnam ?body0 ...)) ) ) )

;from check-errors

(define (make-bad-argument-type-message #!optional argnam)
  (if (not argnam) "bad argument type"
      (format #f "bad `~A' argument type" argnam)) )

(define (make-type-name-tobe-message typnam)
  (format #f "not a ~A" typnam) )

(define (make-error-type-message typnam #!optional argnam)
  (string-append (make-bad-argument-type-message argnam)
                 " - "
                 (make-type-name-tobe-message typnam)) )

(define (alist? obj) (and (list? obj) (every pair? obj)))

(define (error-alist loc obj #!optional argnam)
  (import (only (chicken string) conc))
  (##sys#signal-hook #:type-error loc (make-error-type-message "alist" argnam) obj) )

(define (check-alist loc obj . args)
  (unless (alist? obj)
    (error-alist loc obj (let ((tmp args)) (if (null? tmp) #f (car tmp)))) )
  obj )

;;;

;;

;let actual float in data taint the result
(define VERY-SMALL-EXACT (inexact->exact minimum-flonum))
(define VERY-LARGE-EXACT (inexact->exact maximum-flonum))

(define NORMAL-STATISTICS '(
  size
  max min
  arithmetic-mean (mean . (arithmetic-mean))
  standard-deviation (sd . (standard-deviation)) (sigma . (standard-deviation))
))

(define VERBOSE-STATISTICS (append NORMAL-STATISTICS '(
  (95th . (percentile 95))
  harmonic-mean geometric-mean
  median mode
  variance (var . (variance)) (sigma^2 . (variance))
)))

(define FULL-STATISTICS (append VERBOSE-STATISTICS '(
  kurtosis
  skewness
  interquartile-range
  five-number-summary
)))

(define INITIAL-STATISTICS-SETS `(
  (normal . ,NORMAL-STATISTICS)
  (verbose . ,VERBOSE-STATISTICS)
  (full . ,FULL-STATISTICS)
))

(define-checked-parameter statistics-sets INITIAL-STATISTICS-SETS alist)

(define (checked-statistics-set-id->symbol obj)
  (case obj
    ((#f) 'normal)
    ((#t) 'verbose)
    (else
      (if (symbol? obj) obj
          (error 'statistics-set "not a valid stat-set-id" obj)))) )

(define (statistics-set stat-set-id)
  (let ((key (checked-statistics-set-id->symbol stat-set-id)))
     (alist-ref key (statistics-sets) eq?) ) )

;;

(define-constant ARITHMETIC-IDX 0)
(define-constant HARMONIC-IDX 1)
(define-constant GEOMETRIC-IDX 2)

(define-constant SIGMA-IDX 3)
(define-constant SIGMA^2-IDX 4)

;;

(define (basic-statistics runs)
  (let*-values (((m h g) (mean runs))
                ((s v) (standard-deviation runs m)) )
    (vector m h g s v) ) )

;;

(define (generate-statistics runs #!optional stat-set-id)
  (let ((basics (basic-statistics runs))
        (funcs (statistics-set stat-set-id)) )
    (generate-statistics-alist runs funcs basics) ) )

(define (generate-statistics-alist runs funcs #!optional (basics (basic-statistics runs)))
  (and (list? funcs)
       (map (cut generate-statistics-sublist runs <> basics) funcs) ) )

(define (generate-statistics-sublist runs func #!optional (basics (basic-statistics runs)))
  (let* ((parameterized? (pair? func))
         (func-key (if parameterized? (car func) func))
         (func-name (if parameterized? (cadr func) func-key))
         (func-params (if parameterized? (cddr func) '())) )
    `(,func-key
      .
      ,(case func-name
        ((size)
          (seq:size runs) )
        ((max)
          (seq:foldl max VERY-SMALL-EXACT runs) )
        ((min)
          (seq:foldl min VERY-LARGE-EXACT runs) )
        ((arithmetic-mean)
          (vector-ref basics ARITHMETIC-IDX) )
        ((standard-deviation)
          (vector-ref basics SIGMA-IDX) )
        ((percentile)
          (apply percentile runs func-params) )
        ((harmonic-mean)
          (vector-ref basics HARMONIC-IDX) )
        ((geometric-mean)
          (vector-ref basics GEOMETRIC-IDX) )
        ((median)
          (median runs) )
        ((mode)
          (mode runs) )
        ((variance)
          (vector-ref basics SIGMA^2-IDX) )
        (else
          (error 'statistic "unknown function" func-name)))) ) )

;;

(define (acc-update! acc i f) (set! (vector-ref acc i) (f (vector-ref acc i))))

;seq => arithmetic harmonic geometric
(define (mean runs)
  (let ((size (seq:size runs)))
    (if (fx= 0 size) (values +nan.0 +nan.0 +nan.0)
        (let* ((n size)
               (r (/ 1 n))
               (acc (vector 0 0 1))
               (accum!
                 (lambda (acc value)
                   ;minimize the acccumulated value
                   (acc-update! acc GEOMETRIC-IDX
                     (lambda (run)
                       (let ((x (expt value r)))
                         (* run (if (exact? value) (inexact->exact x) x)))))
                   (acc-update! acc HARMONIC-IDX
                     (lambda (run) (if (zero? value) run (+ run (/ 1 value)))))
                   (acc-update! acc ARITHMETIC-IDX
                     (lambda (run) (+ run (/ value n))))
                   acc ) )
               (acc (seq:foldl accum! acc runs)) )
            (values (vector-ref acc ARITHMETIC-IDX)
                    (let ((h (vector-ref acc HARMONIC-IDX)))
                      (if (zero? h) h (/ n h)))
                    (vector-ref acc GEOMETRIC-IDX) ) ) ) ) )

;;

(define (median runs)
  (if (seq:empty? runs) +nan.0
      (let* ((ordered (seq:sort runs <))
             (mid (- (/ (fx+ (seq:size ordered) 1) 2) 1)) )
        (if (integer? mid) (seq:elt ordered mid))
            (/ (+ (seq:elt ordered (floor mid)) (seq:elt ordered (ceiling mid))) 2) ) ) )

;;

(define (mode runs)
  (define (occur> a b) (> (cdr a) (cdr b)))
  (if (seq:empty? runs) +nan.0
      (let* ((srtd (seq:sort! (seq:histogram runs =) occur>))
             (top-bin (seq:elt srtd 0))
             (btm-bin (seq:elt srtd (fx- (seq:size srtd) 1))) )
        ;when no distinction then no mode: go w/ the bottom
        (if (fx= (cdr top-bin) (cdr btm-bin)) (car btm-bin)
            (car top-bin)) ) ) )

;;

;seq => arithmetic harmonic geometric
(define (arithmetic-mean runs) (receive (m h g) (mean runs) m))

;;

;; Computes the p-quantile for a set of numbers @runs (also known as the
;; "inverse cumulative distribution""). `p` is a real number between 0 and
;; 1.0. For instance, the 0.5-quantile corresponds to the median.
(define (*quantile runs p)
  (if (seq:empty? runs) +nan.0
      ;ceiling np must be valid index, so last index, not size
      (let ((np (* p (fx- (seq:size runs) 1)))
            (ordered (seq:sort runs <)) )
        (arithmetic-mean
          (seq:sequence runs (seq:elt ordered (inexact->exact (floor np)))
                             (seq:elt ordered (inexact->exact (ceiling np))))) ) ) )

(define (quantile runs p)
  (assert (and (real? p) (<= 0 p 1)) 'quantile "not in [0 .. 1]" p)
  (*quantile runs p) )

(define (percentile runs #!optional (p 95))
  (assert (and (real? p) (<= 0 p 100)) 'percentile "not in [0 .. 100]" p)
  (*quantile runs (/ p 100)) )

;seq => Q(.75) - Q(.25)
(define (interquartile-range runs) (- (*quantile runs 3/4) (*quantile runs 1/4)))

;seq => Q(0.0) Q(.25) Q(.50) Q(.75) Q(1.0)
(define (five-number-summary runs)
  (values (*quantile runs 0)
          (*quantile runs 1/4) (*quantile runs 1/2) (*quantile runs 3/4)
          (*quantile runs 1)) )

;;

;seq => sigma sigma^2
(define (standard-deviation runs #!optional (bias #t))
  (let* ((var (variance runs bias))
         (sd (sqrt var)) )
    (values (if (exact? var) (inexact->exact sd) sd) var)) )

(define (stddev runs #!optional (bias #t))
  (receive (sd var) (standard-deviation runs bias) sd) )

;; Computes the average absolute difference between the numbers in list @runs and
;; (median @runs).
(define (absolute-deviation runs)
  (let ((siz (seq:size runs)))
    (if (zero? siz) +nan.0
        (let ((m (arithmetic-mean runs)))
          (max 0 (/ (seq:foldl (lambda (acc elm) (+ acc (abs (- elm m)))) runs) siz)))) ) )

;; Computes the variance for a set of numbers @runs, optionally applying
;; bias correction. @runs is a proper list of numeric values. Bias correction gets
;; enabled by setting `bias` to `#t`. Alternatively, it is possible to provide a
;; a positive integer, which is used instead of the number of elements in @runs.
(define (variance runs #!optional (bias #t))
  (let ((m (arithmetic-mean runs))
        (siz (seq:size runs)) )
    (let ((acc (seq:foldl (lambda (acc elm) (+ acc (expt (- elm m) 2))) 0 runs)))
      (/ acc
         (if (integer? bias) bias
             (- siz (if bias 1 0)))) ) ) )

;;

;; Computes the skewness for a set of numbers @runs, optionally applying
;; bias correction. @runs is a proper list of numeric values. Bias correction gets
;; enabled by setting `bias` to `#t`. Alternatively, it is possible to provide a
;; a positive integer, which is used instead of the number of elements in @runs.
(define (skewness runs #!optional (bias #t))
  (let ((n (seq:size runs))
        (m (arithmetic-mean runs))
        (s (stddev runs bias)) )
    (* (/ (seq:foldl (lambda (z x) (+ z (/ (expt (- x m) 3) n))) 0 runs)
          (expt s 3))
       (if bias
           (let ((n (if (integer? bias) bias n)))
             (/ (sqrt (* n (- n 1))) (- n 2)))
           1)) ) )

;; Computes the kurtosis for a set of numbers @runs, optionally applying
;; bias correction. @runs is a proper list of numeric values. Bias correction gets
;; enabled by setting `bias` to `#t`. Alternatively, it is possible to provide a
;; a positive integer, which is used instead of the number of elements in @runs.
(define (kurtosis runs #!optional (bias #t))
  (let ((n (seq:size runs))
        (m (arithmetic-mean runs))
        (s (stddev runs bias)))
    (/ (seq:foldl (lambda (z x) (+ z (expt (- x m) 4))) 0 runs)
       (* (expt s 4)
          (if (integer? bias)
              bias
              (if bias (- n 1) n)))) ) )

;;

;; Computes the covariance of two sets of numbers @runs1 and @runs2. Both @runs1 and
;; @runs2 are sequences of numbers. Bias correction can be enabled by setting
;; `bias` to `#t`. Alternatively, it is possible to provide a positive integer,
;; which is used instead of the number of elements in @runs1.
(define (covariance runs1 runs2 #!optional (bias #t))
  (let ((n (seq:size runs1)))
    (assert (= n (seq:size runs2)))
    (let ((xm (arithmetic-mean runs1))
          (ym (arithmetic-mean runs2)))
      ;FIXME need 2 seq foldl, coercion expensive
      (/ (fold (lambda (x y z) (+ z (* (- x xm) (- y ym))))
               0 (seq:->list runs1) (seq:->list runs2))
         (if (integer? bias)
             bias
             (- n (if bias 1 0)))) ) ) )

;; Computes the correlation of two sets of numbers @runs1 and @runs2 in form of
;; the Pearson product-moment correlation coefficient_. Both @runs1 and
;; @runs2 are sequences of numbers. Bias correction can be enabled by setting
;; `bias` to `#t`. Alternatively, it is possible to provide a positive integer,
;; which is used instead of the number of elements in @runs1.
(define (correlation runs1 runs2 #!optional (bias #t))
  (/ (covariance runs1 runs2 bias)
     (* (stddev runs1 bias) (stddev runs2 bias))) )

;;

(: chi-square-component (real real -> real))
(: yates-chi-square-component (real real -> real))

(define (yates-chi-square-component o e)
  (let ((r (- (abs (- o e)) 1/2)))
    (/ (* r r) e) ) )

(define (chi-square-component o e)
  (let ((r (- o e)))
    (/ (* r r) e) ) )

(define (chi-components component observed expected)
  (define (xpct-elt-comp s i)
    (component (seq:elt s i) (seq:elt expected (seq:index i))) )
  (if (seq:sequence? expected)
      (seq:smap* #() xpct-elt-comp observed)
      (seq:smap #() (cut component <> expected) observed) ) )

(define (chi-square observed expected #!optional yates?)
  ;
  (assert (seq:sequence? observed)
          'chi-square "observed not a sequence" observed expected yates?)
  (assert (or (seq:sequence? expected) (real? expected))
          'chi-square "expected not a sequence or real" observed expected yates?)
  (when (and (seq:sequence? observed) (seq:sequence? expected))
    (assert (= (seq:size expected) (seq:size observed))
            'chi-square "length observed not same length expected" observed expected yates?) )
  ;
  (let ((component (if yates? yates-chi-square-component chi-square-component)))
    (seq:reduce + 0 (chi-components component observed expected)) ) )

) ;module micro-stats