;; YASOS-based layer system for artificial neural networks

(module nanograd-layer

  (;; Layer predicates and operations
   layer? dense-layer? sequential? flatten-layer?
   ;; Layer construction
   make-dense-layer make-sequential make-flatten
   make-conv2d-layer conv2d-layer?
   ;; Layer operations
   forward parameters zero-grad-layer!
   layer-input-size layer-output-size layer-activation layer-name
   layer-norm
   set-training-mode! set-eval-mode!
   ;; Batch Normalization
   make-batch-norm-2d batch-norm-2d?
   ;; Global Average Pooling
   global-avg-pool2d maxpool2d
   ;; Serialization operations
   layer->serializable serializable->layer
   save-layer load-layer save-model load-model
   ;; Activation functions (as objects)
   make-relu make-tanh make-sigmoid make-identity make-gelu make-silu
   activation? activation-name activation-forward
   ;; Network utilities
   print-layer summary)

  (import scheme
          (chicken base)
          (chicken format)
          (chicken random)
          (srfi 1)
          (srfi 4)
          yasos
          blas
          s11n
          nanograd-autograd)

  ;; Hygienic macro for dtype-based operation dispatch
  (include "with-dtype.scm")

  ;;; ==================================================================
  ;;; Activation Functions as YASOS Objects
  ;;; ==================================================================

  (define-predicate activation?)
  (define-operation (activation-forward act x))
  (define-operation (activation-name act))

  ;; ReLU Activation
  (define (make-relu)
    (object
     ((activation? self) #t)
     ((activation-name self) "ReLU")
     ((activation-forward self x) (relu x))))

  ;; Tanh Activation
  (define (make-tanh)
    (object
     ((activation? self) #t)
     ((activation-name self) "Tanh")
     ((activation-forward self x) (tanh-op x))))

  ;; Sigmoid activation
  (define (make-sigmoid)
    (object
     ((activation? self) #t)
     ((activation-name self) "Sigmoid")
     ((activation-forward self x) (sigmoid x))))

  ;; Identity Activation (no activation)
  (define (make-identity)
    (object
     ((activation? self) #t)
     ((activation-name self) "Identity")
     ((activation-forward self x) x)))

  ;; GeLU Activation
  (define (make-gelu)
    (object
     ((activation? self) #t)
     ((activation-name self) "GeLU")
     ((activation-forward self x) (gelu x))))

  ;; SiLU / Swish Activation
  (define (make-silu)
    (object
     ((activation? self) #t)
     ((activation-name self) "SiLU")
     ((activation-forward self x) (silu x))))

  ;;; ==================================================================
  ;;; Layer Base Operations
  ;;; ==================================================================

  (define-predicate layer?)
  (define-predicate dense-layer?)
  (define-predicate sequential?)
  (define-predicate flatten-layer?)

  (define-operation (forward layer . rest))
  (define-operation (parameters layer))
  (define-operation (zero-grad-layer! layer))
  (define-operation (layer-input-size layer))
  (define-operation (layer-output-size layer))
  (define-operation (layer-activation layer))
  (define-operation (layer-name layer))
  (define-operation (set-training-mode! layer training?))
  (define-operation (set-eval-mode! layer))

  ;; Operations for layer serialization
  (define-operation (save-layer layer filepath))
  (define-operation (layer->serializable layer))
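
  ;; Usage sketch (illustrative, not part of the module): activations are
  ;; plain YASOS objects, so they can be constructed once and applied
  ;; through the generic operations above. Assumes `t` is a tensor created
  ;; with nanograd-autograd.
  ;;
  ;;   (define act (make-relu))
  ;;   (activation-name act)        ; => "ReLU"
  ;;   (activation-forward act t)   ; => tensor with ReLU applied elementwise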

  ;;; ==================================================================
  ;;; Serialization/Deserialization Helpers
  ;;; ==================================================================

  ;; Tensor serialization (uses s11n for efficient storage)
  (define (tensor->serializable tensor)
    "Convert a tensor to a serializable representation"
    (let ((data (tensor-data tensor))
          (shape (tensor-shape tensor))
          (dtype (tensor-dtype tensor))
          (requires-grad (tensor-requires-grad? tensor)))
      `((dtype . ,dtype)
        (shape . ,shape)
        (requires-grad . ,requires-grad)
        (data . ,data))))  ; handle SRFI-4 vectors with s11n

  (define (serializable->tensor serializable-tensor)
    "Reconstruct a tensor from serializable representation"
    (let* ((dtype (cdr (assq 'dtype serializable-tensor)))
           (shape (cdr (assq 'shape serializable-tensor)))
           (requires-grad (cdr (assq 'requires-grad serializable-tensor)))
           (data (cdr (assq 'data serializable-tensor))))
      (case dtype
        ((f32) (make-tensor32 data shape requires-grad: requires-grad))
        ((f64) (make-tensor64 data shape requires-grad: requires-grad))
        (else (error 'serializable->tensor
                     (format #f "Unknown dtype: ~A" dtype))))))

  ;; Activation Function Serialization
  (define (activation->serializable act)
    "Convert an activation function to serializable representation"
    (let ((name (activation-name act)))
      `((type . activation)
        (name . ,name))))

  (define (serializable->activation serializable-act)
    "Reconstruct an activation function from serializable representation"
    (let ((name (cdr (assq 'name serializable-act))))
      (cond
       ((string=? name "ReLU") (make-relu))
       ((string=? name "Tanh") (make-tanh))
       ((string=? name "Sigmoid") (make-sigmoid))
       ((string=? name "Identity") (make-identity))
       ((string=? name "GeLU") (make-gelu))
       ((string=? name "SiLU") (make-silu))
       (else (error 'serializable->activation
                    (format #f "Unknown activation function: ~A" name))))))

  ;;; ==================================================================
  ;;; Layer Deserialization with Dimension Checking
  ;;; ==================================================================

  (define (check-dimension-match expected actual context)
    "Verify that dimensions match, error if not"
    (unless (= expected actual)
      (error 'dimension-mismatch
             (format #f "~A: expected ~A but got ~A" context expected actual))))

  (define (verify-flatten-transition before-layer flatten-layer after-layer)
    "Verify that conv->flatten->dense transition is valid"
    (let ((conv-output (layer-output-size before-layer))
          (dense-input (layer-input-size after-layer)))
      (cond
       ;; If both are known, we can verify the flattening math
       ((and conv-output dense-input)
        ;; For conv layers, output size is num channels
        ;; Dense expects flattened size = channels * height * width
        ;; We need to check if dense-input is a multiple of conv-output
        (unless (and (> dense-input conv-output)
                     (zero? (modulo dense-input conv-output)))
          (printf "Warning: Flatten dimensions may be incompatible: ~A channels -> ~A features\n"
                  conv-output dense-input)))
       ;; If either is unknown, we can't verify
       (else (void)))))
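
  ;; Round-trip sketch (illustrative, not part of the module): a tensor
  ;; survives tensor->serializable / serializable->tensor unchanged, since
  ;; the alist carries dtype, shape, the requires-grad flag and the raw
  ;; SRFI-4 data.
  ;;
  ;;   (define t  (make-tensor32 (f32vector 1.0 2.0 3.0) '(3)))
  ;;   (define t* (serializable->tensor (tensor->serializable t)))
  ;;   ;; t* has the same dtype, shape and data as t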

  (define (serializable->layer serializable-repr)
    "Reconstruct a layer from its serializable representation with dimension checking"
    (let ((layer-type (cdr (assq 'type serializable-repr))))
      (cond

       ;; Dense Layer Deserialization
       ((eq? layer-type 'dense-layer)
        (let* ((name (cdr (assq 'name serializable-repr)))
               (input-size (cdr (assq 'input-size serializable-repr)))
               (output-size (cdr (assq 'output-size serializable-repr)))
               (dtype (cdr (assq 'dtype serializable-repr)))
               (weights-ser (cdr (assq 'weights serializable-repr)))
               (biases-ser (cdr (assq 'biases serializable-repr)))
               (activation-ser (cdr (assq 'activation serializable-repr)))
               ;; Deserialize tensors
               (weights (serializable->tensor weights-ser))
               (biases (serializable->tensor biases-ser))
               (activation (serializable->activation activation-ser))
               ;; Check dimensions
               (weight-shape (tensor-shape weights))
               (bias-shape (tensor-shape biases)))
          ;; Validate weight dimensions
          (check-dimension-match output-size (car weight-shape)
                                 "Dense layer weight rows")
          (check-dimension-match input-size (cadr weight-shape)
                                 "Dense layer weight columns")
          ;; Validate bias dimensions
          (check-dimension-match output-size (car bias-shape)
                                 "Dense layer bias size")
          ;; Create layer with deserialized tensors
          (object
           ;; Type predicates
           ((layer? self) #t)
           ((dense-layer? self) #t)
           ;; Layer info
           ((layer-name self) name)
           ((layer-input-size self) input-size)
           ((layer-output-size self) output-size)
           ((layer-activation self) activation)
           ;; Forward pass
           ((forward self input)
            (let ((input-shape (tensor-shape input)))
              (unless (= (car input-shape) input-size)
                (error 'forward
                       (format #f "Input size mismatch: expected ~A, got ~A"
                               input-size (car input-shape)))))
            (let* ((linear-output (matmul-op weights input))
                   (output-with-bias (add linear-output biases)))
              (activation-forward activation output-with-bias)))
           ;; Get all parameters
           ((parameters self) (list weights biases))
           ;; Zero gradients
           ((zero-grad-layer! self)
            (zero-grad! weights)
            (zero-grad! biases))
           ((layer->serializable self)
            `((type . dense-layer)
              (name . ,name)
              (input-size . ,input-size)
              (output-size . ,output-size)
              (dtype . ,dtype)
              (weights . ,(tensor->serializable weights))
              (biases . ,(tensor->serializable biases))
              (activation . ,(activation->serializable activation))))
           ((save-layer self filepath)
            (save-layer-to-file self filepath)))))

       ;; Conv2D Layer Deserialization
       ((eq? layer-type 'conv2d-layer)
        (let* ((name (cdr (assq 'name serializable-repr)))
               (in-channels (cdr (assq 'in-channels serializable-repr)))
               (out-channels (cdr (assq 'out-channels serializable-repr)))
               (kernel-size (cdr (assq 'kernel-size serializable-repr)))
               (dtype (cdr (assq 'dtype serializable-repr)))
               (stride (cdr (assq 'stride serializable-repr)))
               (padding (cdr (assq 'padding serializable-repr)))
               (weights-ser (cdr (assq 'weights serializable-repr)))
               (biases-ser (cdr (assq 'biases serializable-repr)))
               (activation-ser (cdr (assq 'activation serializable-repr)))
               ;; Deserialize tensors
               (weights (serializable->tensor weights-ser))
               (biases (serializable->tensor biases-ser))
               (activation (serializable->activation activation-ser))
               ;; Check dimensions
               (weight-shape (tensor-shape weights))
               (bias-shape (tensor-shape biases)))
          ;; Validate weight dimensions (out_channels, in_channels, KH, KW)
          (check-dimension-match out-channels (car weight-shape)
                                 "Conv2D output channels")
          (check-dimension-match in-channels (cadr weight-shape)
                                 "Conv2D input channels")
          (check-dimension-match kernel-size (caddr weight-shape)
                                 "Conv2D kernel height")
          (check-dimension-match kernel-size (cadddr weight-shape)
                                 "Conv2D kernel width")
          ;; Validate bias dimensions
          (check-dimension-match out-channels (car bias-shape)
                                 "Conv2D bias size")
          (make-conv2d-layer in-channels out-channels kernel-size
                             stride: stride
                             padding: padding
                             activation: activation
                             dtype: dtype
                             name: name
                             weight-values: (tensor-data weights)
                             bias-values: (tensor-data biases))))

       ;; Sequential Layer Deserialization
       ((eq? layer-type 'sequential)
        (let* ((name (cdr (assq 'name serializable-repr)))
               (layers-ser (cdr (assq 'layers serializable-repr)))
               ;; Recursively deserialize all layers
               (layers (map serializable->layer layers-ser)))
          ;; Verify layer connectivity (allowing for dynamic dimensions)
          (let loop ((remaining-layers layers))
            (when (>= (length remaining-layers) 2)
              (let ((curr-layer (car remaining-layers))
                    (next-layer (cadr remaining-layers)))
                (let ((curr-output (layer-output-size curr-layer))
                      (next-input (layer-input-size next-layer)))
                  ;; Only verify when both dimensions are statically known
                  (cond
                   ((and curr-output next-input)
                    (check-dimension-match curr-output next-input
                                           (format #f "Sequential layer connectivity between ~A and ~A"
                                                   (layer-name curr-layer)
                                                   (layer-name next-layer))))
                   ;; Flatten in the middle - check if flattening makes sense
                   ((flatten-layer? next-layer)
                    (when (>= (length remaining-layers) 3)
                      (let ((after-flatten (caddr remaining-layers)))
                        (verify-flatten-transition curr-layer next-layer after-flatten))))))
                (loop (cdr remaining-layers)))))
          (make-sequential layers name: name)))

       ((eq? layer-type 'flatten-layer)
        (let ((name (cdr (assq 'name serializable-repr))))
          (make-flatten name: name)))

       (else
        (error 'serializable->layer
               (format #f "Unknown layer type: ~A" layer-type))))))
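
  ;; Deserialization sketch (illustrative, not part of the module):
  ;; serializable->layer accepts the alists produced by layer->serializable.
  ;; The simplest case is a flatten layer, which carries no tensors.
  ;;
  ;;   (serializable->layer '((type . flatten-layer) (name . "Flatten")))
  ;;   ;; => a fresh flatten layer named "Flatten"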

  ;;; ==================================================================
  ;;; Dense (Fully Connected) Layer
  ;;; ==================================================================

  ;; Efficient dense layer that handles both 1D and 2D inputs.
  ;; Uses BLAS GEMM for batch operations, so batched output is produced
  ;; directly in (batch-size, output-dim) layout.

  (define (make-dense-layer input-dim output-dim
                            #!key
                            (activation (make-identity))
                            (use-bias #t)
                            (dtype 'f32)
                            (name "Dense"))
    "Dense layer supporting both single vectors and batches.
   Input shapes:
     1D: (input_dim,)            -> output: (output_dim,)
     2D: (batch_size, input_dim) -> output: (batch_size, output_dim)"
    (let* ((weight-data
            (with-dtype dtype
              (let ((w (vec (* output-dim input-dim) 0.0)))
                ;; Initialize with small random values
                (do ((i 0 (+ i 1)))
                    ((= i (* output-dim input-dim)) w)
                  (elt-set! w i (* (sqrt (/ 2.0 input-dim))
                                   (- (pseudo-random-real) 0.5)))))))
           (weight (case dtype
                     ((f32) (make-tensor32 weight-data (list output-dim input-dim)))
                     ((f64) (make-tensor64 weight-data (list output-dim input-dim)))))
           (bias (if use-bias
                     (case dtype
                       ((f32) (make-tensor32 (make-f32vector output-dim 0.0)
                                             (list output-dim)))
                       ((f64) (make-tensor64 (make-f64vector output-dim 0.0)
                                             (list output-dim))))
                     #f)))
      (object
       ((layer? self) #t)
       ((layer-name self) name)
       ((dense-layer? self) #t)
       ((layer-input-size self) input-dim)
       ((layer-output-size self) output-dim)
       ((layer-activation self) activation)

       ((forward self input)
        (let* ((input-shape (tensor-shape input))
               (ndim (length input-shape))
               (requires-grad? (or (tensor-requires-grad? input)
                                   (tensor-requires-grad? weight)
                                   (and bias (tensor-requires-grad? bias)))))
          (cond
           ;; Case 1: 1D input (single vector)
           ((= ndim 1)
            (let ((in-dim (car input-shape)))
              (unless (= in-dim input-dim)
                (error 'forward
                       (format #f "Input size mismatch: expected ~A, got ~A"
                               input-dim in-dim)))
              ;; Standard matrix-vector multiplication
              (let ((output (matmul-op weight input)))
                (if bias
                    (activation-forward activation (add output bias))
                    (activation-forward activation output)))))

           ;; Case 2: 2D input (batch of vectors)
           ((= ndim 2)
            (let ((batch-size (car input-shape))
                  (in-dim (cadr input-shape)))
              (unless (= in-dim input-dim)
                (error 'forward
                       (format #f "Input size mismatch: expected ~A, got ~A"
                               input-dim in-dim)))
              ;; Compute output = X @ W^T to get (batch-size, output-dim) directly
              ;;   X: (batch-size, input-dim)
              ;;   W: (output-dim, input-dim) - will be transposed to (input-dim, output-dim)
              ;;   Result: (batch-size, output-dim)
              (let ((output-data (with-dtype dtype (vec (* batch-size output-dim) 0.0))))
                (with-dtype dtype
                  (gemm! RowMajor NoTrans Trans
                         batch-size output-dim input-dim ; m, n, k
                         1.0 (tensor-data input) (tensor-data weight)
                         0.0 output-data
                         lda: input-dim    ; Leading dimension of X
                         ldb: input-dim    ; Leading dimension of W (before transpose)
                         ldc: output-dim)) ; Leading dimension of output
                (let ((output (make-base-tensor output-data
                                                (list batch-size output-dim)
                                                dtype requires-grad?)))
                  (when requires-grad?
                    (set-backward-fn!
                     output
                     (lambda ()
                       (let ((grad-out (tensor-grad output))
                             (data-weight (tensor-data weight))
                             (data-input (tensor-data input)))
                         ;; Gradient w.r.t. input: grad_X = grad_out @ W
                         ;;   grad_out: (batch-size, output-dim)
                         ;;   W: (output-dim, input-dim)
                         ;;   Result: (batch-size, input-dim)
                         (when (tensor-requires-grad? input)
                           (let ((grad-input (with-dtype dtype (vec (* batch-size input-dim) 0.0))))
                             (with-dtype dtype
                               (gemm! RowMajor NoTrans NoTrans
                                      batch-size input-dim output-dim ; m, n, k
                                      1.0 grad-out data-weight
                                      0.0 grad-input
                                      lda: output-dim  ; Leading dim of grad-out
                                      ldb: input-dim   ; Leading dim of W
                                      ldc: input-dim)) ; Leading dim of grad-input
                             (add-to-grad! input grad-input)))
                         ;; Gradient w.r.t. weight: grad_W = grad_out^T @ X
                         ;;   grad_out^T: (output-dim, batch-size)
                         ;;   X: (batch-size, input-dim)
                         ;;   Result: (output-dim, input-dim)
                         (when (tensor-requires-grad? weight)
                           (let ((grad-weight (with-dtype dtype (vec (* output-dim input-dim) 0.0))))
                             (with-dtype dtype
                               (gemm! RowMajor Trans NoTrans
                                      output-dim input-dim batch-size ; m, n, k
                                      1.0 grad-out data-input
                                      0.0 grad-weight
                                      lda: output-dim  ; Leading dim of grad-out (before transpose)
                                      ldb: input-dim   ; Leading dim of X
                                      ldc: input-dim)) ; Leading dim of grad-weight
                             (add-to-grad! weight grad-weight)))))
                     (list input weight)))
                  ;; Add bias if present (broadcast across batch)
                  (let ((output-with-bias
                         (if bias
                             (let ((bias-data (tensor-data bias))
                                   (biased-data (with-dtype dtype (vec (* batch-size output-dim) 0.0))))
                               ;; Copy output data
                               (with-dtype dtype
                                 (copy-to biased-data output-data size: (* batch-size output-dim)))
                               ;; Add bias to each row
                               (with-dtype dtype
                                 (do ((i 0 (+ i 1)))
                                     ((= i batch-size))
                                   (let ((row-offset (* i output-dim)))
                                     (axpy! output-dim 1.0 bias-data biased-data
                                            offsetY: row-offset))))
                               (let ((biased-output (make-base-tensor biased-data
                                                                      (list batch-size output-dim)
                                                                      dtype requires-grad?)))
                                 (when requires-grad?
                                   (set-backward-fn!
                                    biased-output
                                    (lambda ()
                                      (let ((grad-biased (tensor-grad biased-output)))
                                        ;; Gradient flows back to pre-bias output
                                        (when (tensor-requires-grad? output)
                                          (add-to-grad! output grad-biased))
                                        ;; Gradient w.r.t. bias: sum across batch dimension
                                        (when (and bias (tensor-requires-grad? bias))
                                          (let ((grad-bias (with-dtype dtype (vec output-dim 0.0))))
                                            (with-dtype dtype
                                              (do ((i 0 (+ i 1)))
                                                  ((= i batch-size))
                                                (let ((row-offset (* i output-dim)))
                                                  (axpy! output-dim 1.0 grad-biased grad-bias
                                                         offsetX: row-offset))))
                                            (add-to-grad! bias grad-bias)))))
                                    (list output bias)))
                                 biased-output))
                             output)))
                    ;; Apply activation
                    (activation-forward activation output-with-bias))))))

           (else
            (error 'forward
                   (format #f "Dense layer only supports 1D or 2D inputs, got ~AD" ndim))))))

       ((parameters self)
        (if bias (list weight bias) (list weight)))

       ((zero-grad-layer! self)
        (zero-grad! weight)
        (when bias (zero-grad! bias)))

       ((set-training-mode! self train?) (void))
       ((set-eval-mode! self) (void))

       ((layer->serializable self)
        `((type . dense-layer)
          (name . ,name)
          (input-size . ,input-dim)
          (output-size . ,output-dim)
          (dtype . ,dtype)
          (weights . ,(tensor->serializable weight))
          (biases . ,(if use-bias (tensor->serializable bias) #f))
          (activation . ,(activation->serializable activation))))

       ((save-layer self filepath)
        (save-layer-to-file self filepath)))))
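
  ;; Usage sketch (illustrative, not part of the module): a dense layer
  ;; accepts a single vector or a row-major batch, following the docstring
  ;; above.
  ;;
  ;;   (define fc (make-dense-layer 4 3 activation: (make-relu)))
  ;;   (define x  (make-tensor32 (f32vector 1.0 2.0 3.0 4.0) '(4)))
  ;;   (forward fc x)   ; => tensor of shape (3)
  ;;   ;; batched: an input of shape (2 4) yields a (2 3) output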

  ;;; ==================================================================
  ;;; Sequential Container (chains layers)
  ;;; ==================================================================

  (define (make-sequential layers #!key (name "Sequential"))
    (let ((layer-list layers))
      (object
       ;; Type predicates
       ((layer? self) #t)
       ((sequential? self) #t)
       ;; Layer info
       ((layer-name self) name)
       ((layer-input-size self)
        (if (null? layer-list) 0 (layer-input-size (car layer-list))))
       ((layer-output-size self)
        (if (null? layer-list) 0 (layer-output-size (last layer-list))))
       ;; Forward pass (chain through all layers)
       ((forward self input)
        (fold (lambda (layer x) (forward layer x)) input layer-list))
       ;; Get all parameters from all layers
       ((parameters self)
        (append-map parameters layer-list))
       ;; Zero gradients for all layers
       ((zero-grad-layer! self)
        (for-each zero-grad-layer! layer-list))
       ;; Serialize sequential layer with all its sub-layers
       ((layer->serializable self)
        `((type . sequential)
          (name . ,name)
          (layers . ,(map layer->serializable layer-list))))
       ((save-layer self filepath)
        (save-layer-to-file self filepath)))))

  (define (make-flatten #!key (name "Flatten"))
    "Flatten layer: converts (N, C, H, W) -> (N, C*H*W) or (C, H, W) -> (C*H*W)"
    (object
     ((layer? self) #t)
     ((flatten-layer? self) #t)
     ((layer-name self) name)
     ((layer-input-size self) #f)   ; Unknown until runtime
     ((layer-output-size self) #f)  ; Unknown until runtime
     ((forward self input)
      (let* ((shape (tensor-shape input))
             (ndim (length shape)))
        (cond
         ;; 4D: (N, C, H, W) -> (N, C*H*W)
         ((= ndim 4)
          (let ((N (car shape))
                (flattened-size (apply * (cdr shape))))
            (reshape input (list N flattened-size))))
         ;; 3D: (C, H, W) -> (C*H*W)
         ((= ndim 3)
          (let ((flattened-size (apply * shape)))
            (reshape input (list flattened-size))))
         ;; Already flat
         ((= ndim 2) input)
         ((= ndim 1) input)
         (else (error 'flatten (format #f "Cannot flatten ~AD tensor" ndim))))))
     ((parameters self) '())  ; No parameters
     ((zero-grad-layer! self) (void))
     ((set-training-mode! self train?) (void))
     ((set-eval-mode! self) (void))
     ((layer->serializable self)
      `((type . flatten-layer)
        (name . ,name)))
     ((save-layer self filepath)
      (save-layer-to-file self filepath))))
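
  ;; Usage sketch (illustrative, not part of the module): layers compose by
  ;; listing them in order; forward threads the input through each layer in
  ;; turn.
  ;;
  ;;   (define mlp
  ;;     (make-sequential
  ;;      (list (make-dense-layer 8 16 activation: (make-relu))
  ;;            (make-dense-layer 16 2 activation: (make-identity)))
  ;;      name: "MLP"))
  ;;   (parameters mlp)   ; => weights and biases of both layers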

  ;;; ==================================================================
  ;;; Layer Normalization
  ;;; ==================================================================

  (define (layer-norm x gamma beta #!key (epsilon 1e-5))
    ;; Normalize across features, scale by gamma, shift by beta
    (let* ((dtype (tensor-dtype x))
           (data-x (tensor-data x))
           (n (vector-length-for-dtype data-x dtype)))
      ;; Compute mean and variance
      (define (compute-stats)
        (let ((sum (with-dtype dtype
                     (let loop ((i 0) (sum 0.0))
                       (if (= i n)
                           sum
                           (loop (+ i 1) (+ sum (elt-ref data-x i))))))))
          (let* ((mean (/ sum (exact->inexact n)))
                 (var-sum (with-dtype dtype
                            (let loop ((i 0) (var-sum 0.0))
                              (if (= i n)
                                  var-sum
                                  (let ((diff (- (elt-ref data-x i) mean)))
                                    (loop (+ i 1) (+ var-sum (* diff diff)))))))))
            (values mean (/ var-sum (exact->inexact n))))))
      (let-values (((mean variance) (compute-stats)))
        (let* ((std (sqrt (+ variance epsilon)))
               ;; Normalize, scale, shift
               (normalized
                (let ((norm-data (with-dtype dtype (vec n 0.0))))
                  (with-dtype dtype
                    (do ((i 0 (+ i 1)))
                        ((= i n))
                      (elt-set! norm-data i
                                (/ (- (elt-ref data-x i) mean) std))))
                  (make-base-tensor norm-data (tensor-shape x) dtype
                                    (tensor-requires-grad? x)))))
          ;; scaled = normalized * gamma
          (define scaled (mul normalized gamma))
          ;; output = scaled + beta
          (add scaled beta)))))
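
  ;; Usage sketch (illustrative, not part of the module): gamma and beta are
  ;; ordinary parameter tensors matching the normalized feature size, here 4.
  ;;
  ;;   (define gamma (make-tensor32 (make-f32vector 4 1.0) '(4) requires-grad: #t))
  ;;   (define beta  (make-tensor32 (make-f32vector 4 0.0) '(4) requires-grad: #t))
  ;;   (layer-norm x gamma beta epsilon: 1e-5)   ; x: tensor of shape (4)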

  ;;; ==================================================================
  ;;; Convolutional Layer
  ;;; ==================================================================

  (define-predicate conv2d-layer?)

  (define (make-conv2d-layer in-channels out-channels kernel-size
                             #!key
                             (stride 1)
                             (padding 0)
                             (activation (make-identity))
                             (dtype 'f32)
                             (weight-values #f)
                             (bias-values #f)
                             (name "Conv2D"))
    "2D convolutional layer with batch support
   Input shapes:
     3D: (C, H, W) - single image
     4D: (N, C, H, W) - batched images"
    (let* ((KH kernel-size)
           (KW kernel-size)
           ;; He initialization for conv layers
           (fan-in (* in-channels KH KW))
           (init-scale (sqrt (/ 2.0 fan-in)))
           ;; Initialize weights: (out_channels, in_channels, KH, KW)
           (weight-size (* out-channels in-channels KH KW))
           (weight-data
            (or weight-values
                (with-dtype dtype
                  (let ((w (vec weight-size 0.0)))
                    (do ((i 0 (+ i 1)))
                        ((= i weight-size) w)
                      (elt-set! w i (* init-scale
                                       (- (* 2.0 (pseudo-random-real)) 1.0))))))))
           ;; Initialize biases
           (bias-data
            (or bias-values
                (with-dtype dtype (vec out-channels 0.0))))
           ;; Create parameter tensors
           (weights (case dtype
                      ((f32) (make-tensor32 weight-data (list out-channels in-channels KH KW)))
                      ((f64) (make-tensor64 weight-data (list out-channels in-channels KH KW)))))
           (biases (case dtype
                     ((f32) (make-tensor32 bias-data (list out-channels)))
                     ((f64) (make-tensor64 bias-data (list out-channels))))))
      (object
       ;; Type predicates
       ((layer? self) #t)
       ((conv2d-layer? self) #t)
       ;; Layer info
       ((layer-name self) name)
       ((layer-input-size self) in-channels)
       ((layer-output-size self) out-channels)
       ((layer-activation self) activation)

       ;; Forward pass with proper batch support
       ((forward self input)
        (let* ((ishape (tensor-shape input))
               (ndim (length ishape)))
          ;; Check input dimensions and validate channel count
          (cond
           ;; 3D input: (C, H, W)
           ((= ndim 3)
            (unless (= (car ishape) in-channels)
              (error 'forward
                     (format #f "Input channel mismatch: expected ~A, got ~A"
                             in-channels (car ishape)))))
           ;; 4D input: (N, C, H, W)
           ((= ndim 4)
            (unless (= (cadr ishape) in-channels)
              (error 'forward
                     (format #f "Input channel mismatch: expected ~A, got ~A (batch shape: ~A)"
                             in-channels (cadr ishape) ishape))))
           (else
            (error 'forward
                   (format #f "Conv2D expects 3D (C,H,W) or 4D (N,C,H,W) input, got ~AD" ndim))))
          ;; Apply convolution (conv2d handles both 3D and 4D)
          (let ((conv-output (conv2d input weights biases
                                     stride: stride
                                     padding: padding)))
            ;; Apply activation
            (activation-forward activation conv-output))))

       ;; Get parameters
       ((parameters self) (list weights biases))

       ;; Zero gradients
       ((zero-grad-layer! self)
        (zero-grad! weights)
        (zero-grad! biases))

       ((set-training-mode! self train?) (void))
       ((set-eval-mode! self) (void))

       ((layer->serializable self)
        `((type . conv2d-layer)
          (name . ,name)
          (in-channels . ,in-channels)
          (out-channels . ,out-channels)
          (kernel-size . ,kernel-size)
          (stride . ,stride)
          (padding . ,padding)
          (dtype . ,dtype)
          (weights . ,(tensor->serializable weights))
          (biases . ,(tensor->serializable biases))
          (activation . ,(activation->serializable activation))))

       ((save-layer self filepath)
        (save-layer-to-file self filepath)))))
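
  ;; Usage sketch (illustrative, not part of the module): a 3x3 convolution
  ;; mapping 3 input channels to 8 output channels. `image` is assumed to be
  ;; an f32 tensor of shape (3 32 32).
  ;;
  ;;   (define conv (make-conv2d-layer 3 8 3 stride: 1 padding: 1
  ;;                                   activation: (make-relu)))
  ;;   (forward conv image)   ; => tensor of shape (8 32 32)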

  ;; ==================================================================
  ;; MaxPool2D Layer
  ;; ==================================================================

  (define (maxpool2d input kernel-size #!key (stride #f))
    "2D max pooling operation.
   Input shape: (C, H, W)
   Output shape: (C, OH, OW)"
    (let* ((dtype (tensor-dtype input))
           (ishape (tensor-shape input))
           (C (car ishape))
           (H (cadr ishape))
           (W (caddr ishape))
           (data (tensor-data input))
           (KH kernel-size)
           (KW kernel-size)
           (stride-val (or stride kernel-size))
           ;; Output dimensions
           (OH (+ 1 (quotient (- H KH) stride-val)))
           (OW (+ 1 (quotient (- W KW) stride-val)))
           (output-data (with-dtype dtype (vec (* C OH OW) 0.0)))
           ;; Store indices for backward pass
           (max-indices (make-vector (* C OH OW))))
      ;; Forward: find max in each window
      (with-dtype dtype
        (do ((c 0 (+ c 1))) ((= c C))
          (do ((oh 0 (+ oh 1))) ((= oh OH))
            (do ((ow 0 (+ ow 1))) ((= ow OW))
              (let ((max-val -inf.0)
                    (max-idx 0))
                ;; Find max in kernel window
                (do ((kh 0 (+ kh 1))) ((= kh KH))
                  (do ((kw 0 (+ kw 1))) ((= kw KW))
                    (let* ((ih (+ (* oh stride-val) kh))
                           (iw (+ (* ow stride-val) kw))
                           (input-idx (+ (* c H W) (* ih W) iw))
                           (val (elt-ref data input-idx)))
                      (when (> val max-val)
                        (set! max-val val)
                        (set! max-idx input-idx)))))
                (let ((output-idx (+ (* c OH OW) (* oh OW) ow)))
                  (elt-set! output-data output-idx max-val)
                  (vector-set! max-indices output-idx max-idx)))))))
      (let ((result (make-base-tensor output-data (list C OH OW) dtype
                                      (tensor-requires-grad? input))))
        (when (tensor-requires-grad? input)
          (set-backward-fn!
           result
           (lambda ()
             (let ((grad-out (tensor-grad result))
                   (grad-in (with-dtype dtype (vec (* C H W) 0.0))))
               ;; Gradient flows only to max positions
               (with-dtype dtype
                 (do ((i 0 (+ i 1))) ((= i (* C OH OW)))
                   (let ((max-pos (vector-ref max-indices i))
                         (grad-val (elt-ref grad-out i)))
                     (elt-set! grad-in max-pos
                               (+ (elt-ref grad-in max-pos) grad-val)))))
               (add-to-grad! input grad-in)))
           (list input)))
        result)))
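
  ;; Usage sketch (illustrative, not part of the module): 2x2 max pooling
  ;; with the default stride (equal to the kernel size) halves each spatial
  ;; dimension.
  ;;
  ;;   (maxpool2d feature-map 2)   ; (8 32 32) -> (8 16 16)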

  ;;; ==================================================================
  ;;; Batch Normalization 2D
  ;;; ==================================================================

  (define-predicate batch-norm-2d?)

  (define (make-batch-norm-2d num-features
                              #!key
                              (epsilon 1e-5)
                              (momentum 0.1)
                              (dtype 'f32)
                              (name "BatchNorm2d"))
    "Batch Normalization for 2D convolutions with proper batch support.
   Normalizes activations across the batch dimension:
     y = gamma * (x - mu) / sqrt(sigma^2 + epsilon) + beta
   Args:
     num-features: Number of channels (C)
     epsilon:      Small constant for numerical stability
     momentum:     Momentum for running statistics
     dtype:        Data type
   Input shapes:
     3D: (C, H, W)    - treated as batch of 1
     4D: (N, C, H, W) - standard batch"
    (let* (;; Learnable parameters
           (gamma (case dtype
                    ((f32) (make-tensor32 (make-f32vector num-features 1.0) (list num-features)))
                    ((f64) (make-tensor64 (make-f64vector num-features 1.0) (list num-features)))))
           (beta (case dtype
                   ((f32) (make-tensor32 (make-f32vector num-features 0.0) (list num-features)))
                   ((f64) (make-tensor64 (make-f64vector num-features 0.0) (list num-features)))))
           ;; Running statistics (not trainable)
           (running-mean (with-dtype dtype (vec num-features 0.0)))
           (running-var (with-dtype dtype (vec num-features 1.0)))
           ;; Training mode flag
           (training? #t))
      (object
       ((layer? self) #t)
       ((batch-norm-2d? self) #t)
       ((layer-name self) name)

       ;; Mode control
       ((set-training-mode! self train?) (set! training? train?))
       ((set-eval-mode! self) (set! training? #f))

       ((forward self input)
        "Forward pass through batch normalization.
         During training: uses batch statistics.
         During eval: uses running statistics."
        (let* ((input-shape (tensor-shape input))
               (ndim (length input-shape))
               (C num-features))
          ;; Extract dimensions based on input shape
          (let-values (((N H W)
                        (cond
                         ;; 3D input: (C, H, W) - treat as batch of 1
                         ((= ndim 3)
                          (values 1 (cadr input-shape) (caddr input-shape)))
                         ;; 4D input: (N, C, H, W)
                         ((= ndim 4)
                          (values (car input-shape) (caddr input-shape) (cadddr input-shape)))
                         (else
                          (error 'batch-norm-2d
                                 (format #f "Expected 3D or 4D input, got ~AD" ndim))))))
            (let ((spatial-size (* H W))
                  (batch-spatial (* N H W))
                  (input-data (tensor-data input)))
              (if training?
                  ;; Training mode: compute batch statistics
                  (let ((means (with-dtype dtype (vec C 0.0)))
                        (vars (with-dtype dtype (vec C 0.0))))
                    ;; Compute mean for each channel across batch and spatial dims
                    (with-dtype dtype
                      (do ((c 0 (+ c 1))) ((= c C))
                        (let ((sum (let n-loop ((n 0) (sum 0.0))
                                     (if (= n N)
                                         sum
                                         (let i-loop ((i 0) (sum sum))
                                           (if (= i spatial-size)
                                               (n-loop (+ n 1) sum)
                                               (let ((idx (+ (* n C spatial-size)
                                                             (* c spatial-size) i)))
                                                 (i-loop (+ i 1)
                                                         (+ sum (elt-ref input-data idx))))))))))
                          (elt-set! means c (/ sum batch-spatial)))))
                    ;; Compute variance for each channel
                    (with-dtype dtype
                      (do ((c 0 (+ c 1))) ((= c C))
                        (let* ((mean (elt-ref means c))
                               (sum-sq (let n-loop ((n 0) (sum-sq 0.0))
                                         (if (= n N)
                                             sum-sq
                                             (let i-loop ((i 0) (sum-sq sum-sq))
                                               (if (= i spatial-size)
                                                   (n-loop (+ n 1) sum-sq)
                                                   (let* ((idx (+ (* n C spatial-size)
                                                                  (* c spatial-size) i))
                                                          (val (elt-ref input-data idx))
                                                          (diff (- val mean)))
                                                     (i-loop (+ i 1)
                                                             (+ sum-sq (* diff diff))))))))))
                          (elt-set! vars c (/ sum-sq batch-spatial)))))
                    ;; Update running statistics
                    (do ((c 0 (+ c 1))) ((= c C))
                      (with-dtype dtype
                        (let ((new-mean (elt-ref means c))
                              (new-var (elt-ref vars c))
                              (old-mean (elt-ref running-mean c))
                              (old-var (elt-ref running-var c)))
                          (elt-set! running-mean c
                                    (+ (* (- 1.0 momentum) old-mean)
                                       (* momentum new-mean)))
                          (elt-set! running-var c
                                    (+ (* (- 1.0 momentum) old-var)
                                       (* momentum new-var))))))
                    ;; Normalize using batch statistics
                    (let ((normalized-data (with-dtype dtype (vec (* N C spatial-size) 0.0))))
                      (with-dtype dtype
                        (do ((c 0 (+ c 1))) ((= c C))
                          (let* ((mean (elt-ref means c))
                                 (var (elt-ref vars c))
                                 (gamma-val (elt-ref (tensor-data gamma) c))
                                 (beta-val (elt-ref (tensor-data beta) c))
                                 (std (sqrt (+ var epsilon))))
                            (do ((n 0 (+ n 1))) ((= n N))
                              (do ((i 0 (+ i 1))) ((= i spatial-size))
                                (let ((idx (+ (* n C spatial-size) (* c spatial-size) i)))
                                  (let ((normalized (/ (- (elt-ref input-data idx) mean) std)))
                                    (elt-set! normalized-data idx
                                              (+ (* gamma-val normalized) beta-val)))))))))
                      (make-base-tensor normalized-data input-shape dtype
                                        (tensor-requires-grad? input))))
                  ;; Eval mode: use running statistics
                  (let ((normalized-data (with-dtype dtype (vec (* N C spatial-size) 0.0))))
                    (with-dtype dtype
                      (do ((c 0 (+ c 1))) ((= c C))
                        (let* ((mean (elt-ref running-mean c))
                               (var (elt-ref running-var c))
                               (gamma-val (elt-ref (tensor-data gamma) c))
                               (beta-val (elt-ref (tensor-data beta) c))
                               (std (sqrt (+ var epsilon))))
                          (do ((n 0 (+ n 1))) ((= n N))
                            (do ((i 0 (+ i 1))) ((= i spatial-size))
                              (let ((idx (+ (* n C spatial-size) (* c spatial-size) i)))
                                (let ((normalized (/ (- (elt-ref input-data idx) mean) std)))
                                  (elt-set! normalized-data idx
                                            (+ (* gamma-val normalized) beta-val)))))))))
                    (make-base-tensor normalized-data input-shape dtype #f)))))))

       ((parameters self) (list gamma beta))

       ((zero-grad-layer! self)
        (zero-grad! gamma)
        (zero-grad! beta)))))
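
  ;; Usage sketch (illustrative, not part of the module): batch norm behaves
  ;; differently in training and evaluation, so toggle the mode explicitly
  ;; around inference.
  ;;
  ;;   (define bn (make-batch-norm-2d 8))
  ;;   (forward bn feature-map)     ; training mode: batch statistics
  ;;   (set-eval-mode! bn)
  ;;   (forward bn feature-map)     ; eval mode: running statistics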

  ;;; ==================================================================
  ;;; Global Average Pooling
  ;;; ==================================================================

  (define (global-avg-pool2d input)
    "Global average pooling over spatial dimensions with batch support.
   Input shapes:
     3D: (C, H, W)    -> Output: (C,)
     4D: (N, C, H, W) -> Output: (N, C)"
    (let* ((dtype (tensor-dtype input))
           (shape (tensor-shape input))
           (ndim (length shape)))
      (cond
       ;; 3D input: (C, H, W)
       ((= ndim 3)
        (let* ((C (car shape))
               (H (cadr shape))
               (W (caddr shape))
               (spatial-size (* (cadr shape) (caddr shape)))
               (data (tensor-data input))
               (output-data (with-dtype dtype (vec C 0.0))))
          ;; Average over spatial dimensions for each channel
          (with-dtype dtype
            (do ((c 0 (+ c 1))) ((= c C))
              (let ((sum (let loop ((sum 0.0) (i 0))
                           (if (< i spatial-size)
                               (let ((idx (+ (* c spatial-size) i)))
                                 (loop (+ sum (elt-ref data idx)) (+ i 1)))
                               sum))))
                (elt-set! output-data c (/ sum spatial-size)))))
          (let ((result (make-base-tensor output-data (list C) dtype
                                          (tensor-requires-grad? input))))
            ;; Backward pass
            (when (tensor-requires-grad? input)
              (set-backward-fn!
               result
               (lambda ()
                 (let ((grad-out (tensor-grad result))
                       (grad-in (with-dtype dtype (vec (* C H W) 0.0))))
                   (with-dtype dtype
                     (do ((c 0 (+ c 1))) ((= c C))
                       (let* ((grad-val (elt-ref grad-out c))
                              (grad-per-pixel (/ grad-val spatial-size)))
                         (do ((i 0 (+ i 1))) ((= i spatial-size))
                           (let ((idx (+ (* c spatial-size) i)))
                             (elt-set! grad-in idx grad-per-pixel))))))
                   (add-to-grad! input grad-in)))
               (list input)))
            result)))

       ;; 4D input: (N, C, H, W)
       ((= ndim 4)
        (let* ((N (car shape))
               (C (cadr shape))
               (H (caddr shape))
               (W (cadddr shape))
               (spatial-size (* (caddr shape) (cadddr shape)))
               (data (tensor-data input))
               (output-data (with-dtype dtype (vec (* N C) 0.0))))
          ;; Average over spatial dimensions for each (batch, channel) pair
          (with-dtype dtype
            (do ((n 0 (+ n 1))) ((= n N))
              (do ((c 0 (+ c 1))) ((= c C))
                (let ((sum (let loop ((sum 0.0) (i 0))
                             (if (< i spatial-size)
                                 (let ((idx (+ (* n C spatial-size) (* c spatial-size) i)))
                                   (loop (+ sum (elt-ref data idx)) (+ i 1)))
                                 sum))))
                  (elt-set! output-data (+ (* n C) c) (/ sum spatial-size))))))
          (let ((result (make-base-tensor output-data (list N C) dtype
                                          (tensor-requires-grad? input))))
            ;; Backward pass
            (when (tensor-requires-grad? input)
              (set-backward-fn!
               result
               (lambda ()
                 (let ((grad-out (tensor-grad result))
                       (grad-in (with-dtype dtype (vec (* N C H W) 0.0))))
                   (with-dtype dtype
                     (do ((n 0 (+ n 1))) ((= n N))
                       (do ((c 0 (+ c 1))) ((= c C))
                         (let* ((grad-val (elt-ref grad-out (+ (* n C) c)))
                                (grad-per-pixel (/ grad-val spatial-size)))
                           (do ((i 0 (+ i 1))) ((= i spatial-size))
                             (let ((idx (+ (* n C spatial-size) (* c spatial-size) i)))
                               (elt-set! grad-in idx grad-per-pixel)))))))
                   (add-to-grad! input grad-in)))
               (list input)))
            result)))

       (else
        (error 'global-avg-pool2d
               (format #f "Expected 3D or 4D input, got ~AD" ndim))))))
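
  ;; Usage sketch (illustrative, not part of the module): global average
  ;; pooling collapses the spatial dimensions, e.g. before a final
  ;; classifier head.
  ;;
  ;;   (global-avg-pool2d feature-map)   ; (8 16 16) -> (8), or (N 8 16 16) -> (N 8)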

  ;;; ==================================================================
  ;;; Utilities
  ;;; ==================================================================

  ;; Print layer information
  (define (print-layer layer #!optional (indent 0))
    (let ((spaces (make-string indent #\space)))
      (printf "~A~A: " spaces (layer-name layer))
      (cond
       ((dense-layer? layer)
        (printf "Dense(~A -> ~A, activation=~A)\n"
                (layer-input-size layer)
                (layer-output-size layer)
                (activation-name (layer-activation layer))))
       ((sequential? layer)
        (printf "Sequential\n")
        (let ((params (parameters layer)))
          (printf "~A Total parameters: ~A\n" spaces
                  (fold (lambda (p acc)
                          (let ((data (tensor-data p)))
                            (+ acc (case (tensor-dtype p)
                                     ((f32) (f32vector-length data))
                                     ((f64) (f64vector-length data))))))
                        0 params))))
       (else (printf "Layer\n")))))

  ;; Print model summary
  (define (summary model)
    (printf "\n=== Model Summary ===\n")
    (printf "Model: ~A\n" (layer-name model))
    (printf "Input size: ~A\n" (layer-input-size model))
    (printf "Output size: ~A\n\n" (layer-output-size model))
    (cond
     ((sequential? model)
      (printf "Layers:\n")
      (let ((params (parameters model)))
        (for-each (lambda (p) (print-tensor p)) params)))
     (else (print-layer model)))
    (let ((params (parameters model)))
      (printf "\nTotal parameters: ~A\n"
              (fold (lambda (p acc)
                      (let ((data (tensor-data p)))
                        (+ acc (case (tensor-dtype p)
                                 ((f32) (f32vector-length data))
                                 ((f64) (f64vector-length data))))))
                    0 params)))
    (printf "===================\n\n"))

  ;;; ==================================================================
  ;;; File I/O Operations
  ;;; ==================================================================

  (define (save-layer-to-file layer filepath)
    "Save a layer to a file using s11n serialization"
    (let ((serializable (layer->serializable layer)))
      (with-output-to-file filepath
        (lambda () (serialize serializable)))))

  (define (load-layer-from-file filepath)
    "Load a layer from a file using s11n deserialization"
    (let ((serializable (with-input-from-file filepath
                          (lambda () (deserialize)))))
      (serializable->layer serializable)))

  ;; Public API for layer save/load
  (define (save-layer layer filepath)
    "Public API: Save a layer to file"
    (save-layer-to-file layer filepath))

  (define (load-layer filepath)
    "Public API: Load a layer from file"
    (load-layer-from-file filepath))

  ;; Model save/load (alias for sequential models)
  (define (save-model model filepath)
    "Save a model (sequential or single layer) to file"
    (save-layer-to-file model filepath))

  (define (load-model filepath)
    "Load a model from file"
    (load-layer-from-file filepath))

) ;; end module
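
;; Usage sketch (illustrative, assumes the module is compiled and imported;
;; the path "mlp.model" is a placeholder): models are saved as s11n-serialized
;; alists and rebuilt with dimension checks on load.
;;
;;   (save-model mlp "mlp.model")
;;   (define mlp2 (load-model "mlp.model"))
;;   (summary mlp2)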