;; Tests for Batch Normalization and Global Average Pooling

(import scheme
        (chicken base)
        (chicken format)
        (srfi 1)
        (srfi 4)
        test
        nanograd-autograd
        nanograd-layer)

;;; ==================================================================
;;; Helper Functions
;;; ==================================================================

(define (approx-equal? actual expected tolerance)
  "Check if two numbers are approximately equal within tolerance"
  (<= (abs (- actual expected)) tolerance))

;; Named-assertion wrapper around approx-equal? so failures carry a label.
(define-syntax test-approximate
  (syntax-rules ()
    ((test-approximate name expected actual tolerance)
     (test-assert name (approx-equal? actual expected tolerance)))))

(define (vector-approx-equal? vec1 vec2 tolerance)
  "Check if two f32vectors are approximately equal within tolerance"
  (let ((len (f32vector-length vec1)))
    (and (= len (f32vector-length vec2))
         (let loop ((idx 0))
           (or (= idx len)
               (and (<= (abs (- (f32vector-ref vec1 idx)
                                (f32vector-ref vec2 idx)))
                        tolerance)
                    (loop (+ idx 1))))))))

(define (test-vector-equal vec1 vec2 tolerance)
  "Test helper for vector equality with tolerance"
  (test-assert (vector-approx-equal? vec1 vec2 tolerance)))

;; Shallow copy of an f32vector.
;; NOTE(review): `ssub` is not a standard SRFI-4 binding — presumably
;; exported by nanograd; confirm (standard CHICKEN name is `subf32vector`).
(define (f32vector-copy v)
  (ssub v 0 (f32vector-length v)))

;;; ==================================================================
;;; BATCH NORMALIZATION TESTS
;;; ==================================================================

(test-group "BatchNorm - Training Forward Pass"
  ;; Two-channel layer over 2x2 spatial input.
  ;; Channel 0: [[1,2],[3,4]]  mean=2.5, var=1.25
  ;; Channel 1: [[5,6],[7,8]]  mean=6.5, var=1.25
  (let* ((bn (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         (_ (set-training-mode! bn #t))
         (input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0   ;; Channel 0
                            5.0 6.0 7.0 8.0)  ;; Channel 1
                 '(2 2 2)
                 requires-grad?: #t))
         (output (forward bn input)))

    (test "Output shape matches input" '(2 2 2) (tensor-shape output))

    ;; In training mode the output of each channel should be normalized to
    ;; approximately zero mean and unit variance.
    (let ((data (tensor-data output)))
      ;; Channel 0
      (let* ((ch0-vals (list (f32vector-ref data 0)
                             (f32vector-ref data 1)
                             (f32vector-ref data 2)
                             (f32vector-ref data 3)))
             (ch0-mean (/ (apply + ch0-vals) 4.0))
             (ch0-var  (/ (apply + (map (lambda (x)
                                          (* (- x ch0-mean) (- x ch0-mean)))
                                        ch0-vals))
                          4.0)))
        (test-approximate "Channel 0 normalized mean ~= 0" 0.0 ch0-mean 1e-5)
        (test-approximate "Channel 0 normalized variance ~= 1" 1.0 ch0-var 1e-4))
      ;; Channel 1
      (let* ((ch1-vals (list (f32vector-ref data 4)
                             (f32vector-ref data 5)
                             (f32vector-ref data 6)
                             (f32vector-ref data 7)))
             (ch1-mean (/ (apply + ch1-vals) 4.0))
             (ch1-var  (/ (apply + (map (lambda (x)
                                          (* (- x ch1-mean) (- x ch1-mean)))
                                        ch1-vals))
                          4.0)))
        (test-approximate "Channel 1 normalized mean ~= 0" 0.0 ch1-mean 1e-5)
        (test-approximate "Channel 1 normalized variance ~= 1" 1.0 ch1-var 1e-4)))))

(test-group "BatchNorm - Running Statistics"
  ;; Run two batches with different per-channel statistics through the layer
  ;; in training mode; each forward pass should fold the batch statistics
  ;; into the running mean/variance.
  (let* ((bn (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         (_ (set-training-mode! bn #t))
         ;; First batch
         (input1 (make-tensor32
                  (f32vector 0.0 0.0 0.0 0.0      ;; Channel 0: all zeros
                             10.0 10.0 10.0 10.0) ;; Channel 1: all tens
                  '(2 2 2)))
         (_ (forward bn input1))
         ;; Second batch with different statistics
         (input2 (make-tensor32
                  (f32vector 2.0 2.0 2.0 2.0      ;; Channel 0: all twos
                             20.0 20.0 20.0 20.0) ;; Channel 1: all twenties
                  '(2 2 2)))
         (_ (forward bn input2)))
    ;; Smoke test only: both passes completed without error.
    (test-assert "Running statistics updated after 2 batches" #t)))

(test-group "BatchNorm - Evaluation Mode"
  (let* ((bn (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         ;; Train on some data to populate running stats
         (_ (set-training-mode! bn #t))
         (train-input (make-tensor32
                       (f32vector 1.0 2.0 3.0 4.0
                                  5.0 6.0 7.0 8.0)
                       '(2 2 2)))
         (_ (forward bn train-input))
         ;; Switch to eval mode
         (_ (set-eval-mode! bn))
         ;; Eval mode uses frozen running statistics, so identical inputs
         ;; must produce identical outputs.
         (eval-input1 (make-tensor32
                       (f32vector 10.0 20.0 30.0 40.0
                                  50.0 60.0 70.0 80.0)
                       '(2 2 2)))
         (output1 (forward bn eval-input1))
         (eval-input2 (make-tensor32
                       (f32vector 10.0 20.0 30.0 40.0
                                  50.0 60.0 70.0 80.0)
                       '(2 2 2)))
         (output2 (forward bn eval-input2)))
    (let ((data1 (tensor-data output1))
          (data2 (tensor-data output2)))
      (test-approximate "Eval mode is deterministic [0]"
                        (f32vector-ref data1 0) (f32vector-ref data2 0) 1e-6)
      (test-approximate "Eval mode is deterministic [3]"
                        (f32vector-ref data1 3) (f32vector-ref data2 3) 1e-6)
      (test-approximate "Eval mode is deterministic [7]"
                        (f32vector-ref data1 7) (f32vector-ref data2 7) 1e-6))))

(test-group "BatchNorm - Mode Switching"
  (let* ((bn (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         (input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0
                            5.0 6.0 7.0 8.0)
                 '(2 2 2)))
         ;; Training mode - uses batch statistics
         (_ (set-training-mode! bn #t))
         (train-output (forward bn input))
         ;; Eval mode - uses running statistics
         (_ (set-eval-mode! bn))
         (eval-output (forward bn input)))
    ;; Different statistics are used in each mode, so outputs should differ.
    (let ((train-data (tensor-data train-output))
          (eval-data (tensor-data eval-output)))
      (let ((diff (abs (- (f32vector-ref train-data 0)
                          (f32vector-ref eval-data 0)))))
        (test-assert "Training and eval modes produce different outputs"
                     (> diff 1e-6))))
    ;; Switch back to training and run once more for effect.
    (set-training-mode! bn #t)
    (let ((train-output2 (forward bn input)))
      (test-assert "Can switch between training and eval modes" #t))))

(test-group "BatchNorm - Learnable Parameters"
  (let* ((bn (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         (_ (set-training-mode! bn #t))
         ;; Get parameters (gamma, beta)
         (params (parameters bn))
         (gamma (car params))
         (beta (cadr params)))
    (test "BatchNorm has 2 parameters (gamma, beta)" 2 (length params))
    (test-approximate "Gamma initialized to 1.0"
                      1.0 (f32vector-ref (tensor-data gamma) 0) 1e-6)
    (test-approximate "Beta initialized to 0.0"
                      0.0 (f32vector-ref (tensor-data beta) 0) 1e-6)
    (test-assert "Gamma requires gradient" (tensor-requires-grad? gamma))
    (test-assert "Beta requires gradient" (tensor-requires-grad? beta))))

(test-group "BatchNorm - Gradient Flow"
  (let* ((bn (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         (_ (set-training-mode! bn #t))
         (input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0
                            5.0 6.0 7.0 8.0)
                 '(2 2 2)
                 requires-grad?: #t))
         (output (forward bn input))
         ;; Simple loss: sum of outputs
         (loss (sum-tensor output))
         (_ (backward! loss)))
    (test-assert "Input has gradient after backward" (tensor-grad input))
    (let ((params (parameters bn)))
      (test-assert "Gamma has gradient after backward"
                   (tensor-grad (car params)))
      (test-assert "Beta has gradient after backward"
                   (tensor-grad (cadr params))))
    ;; Sanity-check gradient magnitudes.
    (let* ((grad-input (tensor-grad input))
           (max-grad (fold max -inf.0 (f32vector->list grad-input))))
      (test-assert "Input gradients have reasonable magnitude"
                   (< (abs max-grad) 10.0)))))

(test-group "BatchNorm - Gamma/Beta Gradients"
  (let* ((bn (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         (_ (set-training-mode! bn #t))
         ;; Known input: all ones for channel 0, all twos for channel 1
         (input (make-tensor32
                 (f32vector 1.0 1.0 1.0 1.0
                            2.0 2.0 2.0 2.0)
                 '(2 2 2)
                 requires-grad?: #t))
         (output (forward bn input))
         ;; Loss: sum of outputs
         (loss (sum-tensor output))
         (_ (backward! loss))
         (params (parameters bn))
         (gamma (car params))
         (beta (cadr params)))
    ;; Since loss is the plain sum, dL/doutput = 1 everywhere, so
    ;; dL/dbeta = sum over the 4 pixels of each channel = 4.
    (let ((grad-beta (tensor-grad beta)))
      (test-approximate "Beta gradient channel 0 (sum over 4 pixels)"
                        4.0 (f32vector-ref grad-beta 0) 1e-4)
      (test-approximate "Beta gradient channel 1 (sum over 4 pixels)"
                        4.0 (f32vector-ref grad-beta 1) 1e-4))
    ;; dL/dgamma = sum(dL/doutput * normalized-input); with a constant
    ;; per-channel input the normalized values are 0, so gamma grad = 0.
    (let ((grad-gamma (tensor-grad gamma)))
      (test-approximate "Gamma gradient channel 0 (zero for constant input)"
                        0.0 (f32vector-ref grad-gamma 0) 1e-4)
      (test-approximate "Gamma gradient channel 1 (zero for constant input)"
                        0.0 (f32vector-ref grad-gamma 1) 1e-4))))

(test-group "BatchNorm - Different Spatial Sizes"
  (let* ((bn (make-batch-norm-2d 3 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         (_ (set-training-mode! bn #t))
         ;; 3 channels, 1x1 spatial
         (input-1x1 (make-tensor32 (f32vector 1.0 2.0 3.0) '(3 1 1)))
         (output-1x1 (forward bn input-1x1))
         ;; 3 channels, 4x4 spatial
         (input-4x4 (make-tensor32 (make-f32vector (* 3 4 4) 1.0) '(3 4 4)))
         (output-4x4 (forward bn input-4x4))
         ;; 3 channels, 7x7 spatial (like after ResNet conv1)
         (input-7x7 (make-tensor32 (make-f32vector (* 3 7 7) 2.0) '(3 7 7)))
         (output-7x7 (forward bn input-7x7)))
    (test "1x1 spatial works" '(3 1 1) (tensor-shape output-1x1))
    (test "4x4 spatial works" '(3 4 4) (tensor-shape output-4x4))
    (test "7x7 spatial works" '(3 7 7) (tensor-shape output-7x7))))

(test-group "BatchNorm - Numerical Gradient Check"
  ;; Central-difference check of the analytical input gradient for the
  ;; first four elements.
  (let* ((epsilon 1e-4)
         (bn (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         (_ (set-training-mode! bn #t))
         (input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0
                            5.0 6.0 7.0 8.0)
                 '(2 2 2)
                 requires-grad?: #t))
         ;; Analytical gradient via autograd
         (output (forward bn input))
         (loss (sum-tensor output))
         (_ (backward! loss))
         (analytical-grad (tensor-grad input)))
    (do ((i 0 (+ i 1)))
        ((= i 4))
      ;; Fresh BN layers per perturbation so running statistics cannot
      ;; contaminate the finite-difference estimate.
      (let ((bn-plus (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
            (bn-minus (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32)))
        (set-training-mode! bn-plus #t)
        (set-training-mode! bn-minus #t)
        ;; Perturb +epsilon
        (let ((input-plus (make-tensor32 (f32vector-copy (tensor-data input))
                                         '(2 2 2))))
          (f32vector-set! (tensor-data input-plus) i
                          (+ (f32vector-ref (tensor-data input) i) epsilon))
          (let* ((output-plus (forward bn-plus input-plus))
                 (loss-plus (f32vector-ref
                             (tensor-data (sum-tensor output-plus)) 0)))
            ;; Perturb -epsilon
            (let ((input-minus (make-tensor32 (f32vector-copy (tensor-data input))
                                              '(2 2 2))))
              (f32vector-set! (tensor-data input-minus) i
                              (- (f32vector-ref (tensor-data input) i) epsilon))
              (let* ((output-minus (forward bn-minus input-minus))
                     (loss-minus (f32vector-ref
                                  (tensor-data (sum-tensor output-minus)) 0)))
                (let ((numerical (/ (- loss-plus loss-minus) (* 2.0 epsilon)))
                      (analytical (f32vector-ref analytical-grad i)))
                  (test-approximate
                   (sprintf "Gradient check position ~A" i)
                   analytical numerical 1e-2))))))))))

(test-group "BatchNorm - Gradients with Varied Input"
  (let* ((bn (make-batch-norm-2d 2 epsilon: 1e-5 momentum: 0.1 dtype: 'f32))
         (_ (set-training-mode! bn #t))
         ;; Varied input (not constant per channel)
         (input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0
                            5.0 6.0 7.0 8.0)
                 '(2 2 2)
                 requires-grad?: #t))
         (output (forward bn input))
         ;; Sum-of-squares loss so the gamma gradient does not vanish.
         (output-flat (reshape output '(8)))
         (squared (mul output-flat output-flat))
         (loss (sum-tensor squared))
         (_ (backward! loss))
         (params (parameters bn))
         (gamma (car params))
         (beta (cadr params)))
    (let ((grad-gamma (tensor-grad gamma)))
      (test-assert "Gamma gradient non-zero for varied input (ch0)"
                   (> (abs (f32vector-ref grad-gamma 0)) 1e-6))
      (test-assert "Gamma gradient non-zero for varied input (ch1)"
                   (> (abs (f32vector-ref grad-gamma 1)) 1e-6)))
    (let ((grad-beta (tensor-grad beta)))
      (test-assert "Beta gradients computed" #t))))

;;; ==================================================================
;;; GLOBAL AVERAGE POOLING TESTS
;;; ==================================================================

(test-group "Global Average Pooling - Forward Pass"
  ;; 2 channels, 2x2 spatial.
  ;; Channel 0: [[1,2],[3,4]]  mean = 2.5
  ;; Channel 1: [[5,6],[7,8]]  mean = 6.5
  (let* ((input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0
                            5.0 6.0 7.0 8.0)
                 '(2 2 2)
                 requires-grad?: #t))
         (output (global-avg-pool2d input)))
    (test "GAP output shape is (num_channels)" '(2) (tensor-shape output))
    (test-approximate "Channel 0 average is 2.5"
                      2.5 (f32vector-ref (tensor-data output) 0) 1e-6)
    (test-approximate "Channel 1 average is 6.5"
                      6.5 (f32vector-ref (tensor-data output) 1) 1e-6)))

(test-group "GAP - Different Spatial Sizes"
  ;; 3x3 spatial
  (let* ((input-3x3 (make-tensor32
                     (f32vector 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0)
                     '(1 3 3)))
         (output-3x3 (global-avg-pool2d input-3x3)))
    (test-approximate "3x3 spatial: mean of 1..9 is 5.0"
                      5.0 (f32vector-ref (tensor-data output-3x3) 0) 1e-6))
  ;; 1x1 spatial (edge case)
  (let* ((input-1x1 (make-tensor32 (f32vector 7.0) '(1 1 1)))
         (output-1x1 (global-avg-pool2d input-1x1)))
    (test-approximate "1x1 spatial: mean is the value itself"
                      7.0 (f32vector-ref (tensor-data output-1x1) 0) 1e-6))
  ;; 7x7 spatial (typical ResNet)
  (let* ((input-7x7 (make-tensor32
                     (make-f32vector 49 10.0) ;; All 10s
                     '(1 7 7)))
         (output-7x7 (global-avg-pool2d input-7x7)))
    (test-approximate "7x7 spatial with constant values"
                      10.0 (f32vector-ref (tensor-data output-7x7) 0) 1e-6)))

(test-group "GAP - Multiple Channels"
  ;; 4 channels, 2x2 spatial
  (let* ((input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0       ;; Channel 0: mean = 2.5
                            10.0 20.0 30.0 40.0   ;; Channel 1: mean = 25.0
                            0.0 0.0 0.0 0.0       ;; Channel 2: mean = 0.0
                            -1.0 -2.0 -3.0 -4.0)  ;; Channel 3: mean = -2.5
                 '(4 2 2)))
         (output (global-avg-pool2d input)))
    (test "4 channels output shape" '(4) (tensor-shape output))
    (test-approximate "Channel 0 mean is 2.5"
                      2.5 (f32vector-ref (tensor-data output) 0) 1e-6)
    (test-approximate "Channel 1 mean is 25.0"
                      25.0 (f32vector-ref (tensor-data output) 1) 1e-6)
    (test-approximate "Channel 2 mean is 0.0"
                      0.0 (f32vector-ref (tensor-data output) 2) 1e-6)
    (test-approximate "Channel 3 mean is -2.5"
                      -2.5 (f32vector-ref (tensor-data output) 3) 1e-6)))

(test-group "GAP - Gradient Flow"
  (let* ((input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0
                            5.0 6.0 7.0 8.0)
                 '(2 2 2)
                 requires-grad?: #t))
         (output (global-avg-pool2d input))
         ;; Create target and compute loss
         (target (make-tensor32 (f32vector 5.0 10.0) '(2)))
         (loss (mse-loss output target))
         (_ (backward! loss)))
    (test-assert "Input has gradient after backward" (tensor-grad input))
    ;; MSE gradient: dL/doutput[i] = (2/n) * (output[i] - target[i])
    ;;   Channel 0: 1/2 * (2/2) * (2.5 - 5.0)  = -1.25
    ;;   Channel 1: 1/2 * (2/2) * (6.5 - 10.0) = -1.75
    ;; GAP spreads each channel's gradient equally over its 4 pixels:
    ;;   Channel 0: -1.25 / 4 = -0.3125 per pixel
    ;;   Channel 1: -1.75 / 4 = -0.4375 per pixel
    (let ((grad (tensor-grad input))
          (expected-ch0 -0.3125)
          (expected-ch1 -0.4375))
      (test-approximate "Channel 0 gradient distributed [0,0]"
                        expected-ch0 (f32vector-ref grad 0) 1e-5)
      (test-approximate "Channel 0 gradient distributed [0,1]"
                        expected-ch0 (f32vector-ref grad 1) 1e-5)
      (test-approximate "Channel 0 gradient distributed [1,0]"
                        expected-ch0 (f32vector-ref grad 2) 1e-5)
      (test-approximate "Channel 0 gradient distributed [1,1]"
                        expected-ch0 (f32vector-ref grad 3) 1e-5)
      (test-approximate "Channel 1 gradient distributed [0,0]"
                        expected-ch1 (f32vector-ref grad 4) 1e-5)
      (test-approximate "Channel 1 gradient distributed [0,1]"
                        expected-ch1 (f32vector-ref grad 5) 1e-5)
      (test-approximate "Channel 1 gradient distributed [1,0]"
                        expected-ch1 (f32vector-ref grad 6) 1e-5)
      (test-approximate "Channel 1 gradient distributed [1,1]"
                        expected-ch1 (f32vector-ref grad 7) 1e-5))))

(test-group "GAP - Equal Gradient Distribution"
  ;; Every pixel should receive the same gradient (key property of GAP).
  (let* ((input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0)
                 '(1 3 3)
                 requires-grad?: #t))
         (output (global-avg-pool2d input))
         ;; Simple case: scale the pooled scalar by 2.
         (scaled (scale-op output 2.0))
         (_ (backward! scaled))
         (grad (tensor-grad input))
         (expected (/ 2.0 9.0)))
    ;; All 9 pixels should carry the same gradient: 2.0/9 ~= 0.222...
    (do ((i 0 (+ i 1)))
        ((= i 9))
      (test-approximate (sprintf "Pixel ~A has equal gradient" i)
                        expected (f32vector-ref grad i) 1e-6))))

(test-group "GAP - Numerical Gradient Check"
  ;; Central-difference check of the analytical gradient at all 8 positions.
  (let* ((epsilon 5e-4)
         (input (make-tensor32
                 (f32vector 1.0 2.0 3.0 4.0
                            5.0 6.0 7.0 8.0)
                 '(2 2 2)
                 requires-grad?: #t))
         ;; Analytical gradient via autograd
         (output (global-avg-pool2d input))
         (loss (sum-tensor output))
         (_ (backward! loss))
         (analytical-grad (tensor-grad input)))
    (do ((i 0 (+ i 1)))
        ((= i 8))
      ;; +epsilon
      (let ((input-plus (make-tensor32 (f32vector-copy (tensor-data input))
                                       '(2 2 2))))
        (f32vector-set! (tensor-data input-plus) i
                        (+ (f32vector-ref (tensor-data input) i) epsilon))
        (let* ((output-plus (global-avg-pool2d input-plus))
               (loss-plus (compensated-sum
                           'f32
                           (tensor-data output-plus)
                           0
                           (f32vector-length (tensor-data output-plus)))))
          ;; -epsilon
          (let ((input-minus (make-tensor32 (f32vector-copy (tensor-data input))
                                            '(2 2 2))))
            (f32vector-set! (tensor-data input-minus) i
                            (- (f32vector-ref (tensor-data input) i) epsilon))
            (let* ((output-minus (global-avg-pool2d input-minus))
                   (loss-minus (compensated-sum
                                'f32
                                (tensor-data output-minus)
                                0
                                (f32vector-length (tensor-data output-minus)))))
              (let ((numerical (/ (- loss-plus loss-minus) (* 2.0 epsilon)))
                    (analytical (f32vector-ref analytical-grad i)))
                (test-approximate
                 (sprintf "GAP gradient check position ~A" i)
                 analytical numerical 1e-3)))))))))

;;; ==================================================================
;;; Run All Tests
;;; ==================================================================

(test-exit)