;; Convolutional neural network example
;; Demonstrates building, training, and evaluating a CNN for image
;; classification on a synthetic line-pattern dataset.

(import scheme
        (chicken base)
        (chicken format)
        (chicken random)
        (chicken time)
        (chicken file)
        (chicken io)
        (srfi 1)
        (srfi 4)
        (srfi 42)
        (srfi 69)
        blas
        nanograd-autograd
        nanograd-layer
        nanograd-optimizer
        nanograd-diagnostics)

(define (f32vector-fold f x0 v . rest)
  "Fold F element-wise across one or more f32vectors.
   F is called as (f v1[i] v2[i] ... acc); X0 seeds the accumulator.
   All vectors are assumed to be at least as long as V."
  (let ((n (f32vector-length v))
        (vs (cons v rest)))
    (let loop ((i 0) (acc x0))
      (if (= i n)
          acc
          (loop (+ i 1)
                (apply f (append (map (lambda (w) (f32vector-ref w i)) vs)
                                 (list acc))))))))

(define (set-random-seed! seed)
  "Set random seed for reproducibility"
  (set-pseudo-random-seed! (number->string seed)))

;;; ==================================================================
;;; Data Generation: Synthetic Image Dataset
;;; ==================================================================

;; Generate synthetic 28x28 grayscale images with simple patterns
;; Class 0: Vertical lines
;; Class 1: Horizontal lines
;; Class 2: Diagonal lines (top-left to bottom-right)
;; Class 3: Diagonal lines (top-right to bottom-left)

(define image-size 28)
(define num-channels 1)
(define num-classes 4)

(define (make-blank-image)
  "Allocate a zero-filled C*H*W image buffer."
  (make-f32vector (* num-channels image-size image-size) 0.0))

(define (add-noise! img noise-level)
  "Add uniform random noise in [-noise-level/2, +noise-level/2] to IMG in place."
  (let ((n (f32vector-length img)))
    (do ((i 0 (+ i 1))) ((= i n))
      (f32vector-set! img i
                      (+ (f32vector-ref img i)
                         (* noise-level (- (pseudo-random-real) 0.5)))))))

(define (generate-vertical-lines)
  "Generate image with vertical lines (Class 0)"
  (let ((img (make-blank-image)))
    ;; Draw 3-5 vertical lines, each 3 pixels wide (x-1, x, x+1)
    (let ((num-lines (+ 3 (pseudo-random-integer 3))))
      (do ((line 0 (+ line 1))) ((= line num-lines))
        (let ((x (pseudo-random-integer image-size)))
          (do ((y 0 (+ y 1))) ((= y image-size))
            (do ((dx -1 (+ dx 1))) ((> dx 1))
              (let ((xx (+ x dx)))
                (when (and (>= xx 0) (< xx image-size))
                  (f32vector-set! img (+ (* y image-size) xx) 1.0))))))))
    (add-noise! img 0.1)
    img))

(define (generate-horizontal-lines)
  "Generate image with horizontal lines (Class 1)"
  (let ((img (make-blank-image)))
    ;; Draw 3-5 horizontal lines, each 3 pixels tall (y-1, y, y+1)
    (let ((num-lines (+ 3 (pseudo-random-integer 3))))
      (do ((line 0 (+ line 1))) ((= line num-lines))
        (let ((y (pseudo-random-integer image-size)))
          (do ((x 0 (+ x 1))) ((= x image-size))
            (do ((dy -1 (+ dy 1))) ((> dy 1))
              (let ((yy (+ y dy)))
                (when (and (>= yy 0) (< yy image-size))
                  (f32vector-set! img (+ (* yy image-size) x) 1.0))))))))
    (add-noise! img 0.1)
    img))

(define (generate-diagonal-lr)
  "Generate image with diagonal lines top-left to bottom-right (Class 2)"
  (let ((img (make-blank-image)))
    ;; Draw 2-3 diagonals y = x + offset, offset in [-image-size, image-size)
    (let ((num-lines (+ 2 (pseudo-random-integer 2))))
      (do ((line 0 (+ line 1))) ((= line num-lines))
        (let ((offset (- (pseudo-random-integer (* 2 image-size)) image-size)))
          (do ((i 0 (+ i 1))) ((= i image-size))
            (let ((x i)
                  (y (+ i offset)))
              (when (and (>= y 0) (< y image-size))
                ;; Thicken the line along the diagonal direction
                (do ((d -1 (+ d 1))) ((> d 1))
                  (let ((xx (+ x d))
                        (yy (+ y d)))
                    (when (and (>= xx 0) (< xx image-size)
                               (>= yy 0) (< yy image-size))
                      (f32vector-set! img (+ (* yy image-size) xx) 1.0))))))))))
    (add-noise! img 0.1)
    img))

(define (generate-diagonal-rl)
  "Generate image with diagonal lines top-right to bottom-left (Class 3)"
  (let ((img (make-blank-image)))
    ;; Draw 2-3 anti-diagonals, offset in [0, 2*image-size)
    (let ((num-lines (+ 2 (pseudo-random-integer 2))))
      (do ((line 0 (+ line 1))) ((= line num-lines))
        (let ((offset (pseudo-random-integer (* 2 image-size))))
          (do ((i 0 (+ i 1))) ((= i image-size))
            (let ((x (- image-size 1 i))
                  (y (- (+ i offset) image-size)))
              (when (and (>= y 0) (< y image-size))
                (do ((d -1 (+ d 1))) ((> d 1))
                  (let ((xx (+ x d))
                        (yy (+ y d)))
                    (when (and (>= xx 0) (< xx image-size)
                               (>= yy 0) (< yy image-size))
                      (f32vector-set! img (+ (* yy image-size) xx) 1.0))))))))))
    (add-noise! img 0.1)
    img))

(define (generate-sample class)
  "Generate a sample image for given class"
  (case class
    ((0) (generate-vertical-lines))
    ((1) (generate-horizontal-lines))
    ((2) (generate-diagonal-lr))
    ((3) (generate-diagonal-rl))
    (else (error "Invalid class"))))

(define (shuffle lst)
  "Fisher-Yates shuffle; returns a new list, LST is not modified."
  (let* ((vec (list->vector lst))
         (n (vector-length vec)))
    (do ((i (- n 1) (- i 1)))
        ((< i 1) (vector->list vec))
      (let* ((j (pseudo-random-integer (+ i 1)))
             (tmp (vector-ref vec i)))
        (vector-set! vec i (vector-ref vec j))
        (vector-set! vec j tmp)))))

(define (generate-dataset n-per-class)
  "Generate a balanced dataset with N-PER-CLASS samples per class.
   Returns a shuffled list of (image-f32vector . class-index) pairs.
   BUG FIX: the previous version computed (reverse (shuffle dataset)) in the
   do-loop exit expression and then discarded it, returning the unshuffled,
   class-ordered list instead."
  (let ((dataset '()))
    (do ((class 0 (+ class 1)))
        ((= class num-classes))
      (do ((i 0 (+ i 1)))
          ((= i n-per-class))
        (set! dataset (cons (cons (generate-sample class) class) dataset))))
    (shuffle dataset)))

(define (fill-ones! vec dtype)
  "Fill VEC (an f32vector or f64vector, per DTYPE) with 1.0 in place."
  (let ((n (vector-length-for-dtype vec dtype)))
    (case dtype
      ((f32) (do ((i 0 (+ i 1))) ((= i n)) (f32vector-set! vec i 1.0)))
      ((f64) (do ((i 0 (+ i 1))) ((= i n)) (f64vector-set! vec i 1.0))))))

;;; ==================================================================
;;; Batch Construction
;;; ==================================================================

(define (stack-images batch)
  "Stack a batch of (image . class) pairs into a single 4D tensor (N, C, H, W).
   Walks the list once instead of calling list-ref per index (avoids O(n^2))."
  (let* ((batch-size (length batch))
         (img-size (f32vector-length (caar batch)))
         (batched-data (make-f32vector (* batch-size img-size) 0.0)))
    ;; Copy each image into its contiguous slice of the batched buffer
    (let loop ((samples batch) (offset 0))
      (unless (null? samples)
        (let ((img-data (caar samples)))
          (do ((j 0 (+ j 1))) ((= j img-size))
            (f32vector-set! batched-data (+ offset j)
                            (f32vector-ref img-data j))))
        (loop (cdr samples) (+ offset img-size))))
    ;; Return as 4D tensor: (batch_size, channels, height, width)
    (make-tensor32 batched-data
                   (list batch-size num-channels image-size image-size)
                   requires-grad?: #f)))

(define (stack-targets batch)
  "Stack batch targets into a 1D tensor (N,) of class indices (as floats)."
  (let* ((batch-size (length batch))
         (target-data (make-f32vector batch-size 0.0)))
    (let loop ((samples batch) (i 0))
      (unless (null? samples)
        (f32vector-set! target-data i (exact->inexact (cdar samples)))
        (loop (cdr samples) (+ i 1))))
    (make-tensor32 target-data (list batch-size))))

;;; ==================================================================
;;; CNN Architecture
;;; ==================================================================

(define (build-cnn)
  "Build a simple CNN for 28x28 grayscale image classification.
   Architecture:
     - Conv2D: 1->16 channels, 3x3 kernel, stride=1, padding=1
     - ReLU
     - Conv2D: 16->32 channels, 3x3 kernel, stride=2, padding=1 (14x14)
     - ReLU
     - Conv2D: 32->64 channels, 3x3 kernel, stride=2, padding=1 (7x7)
     - ReLU
     - Flatten: 64*7*7 = 3136
     - Dense: 3136 -> 128
     - ReLU
     - Dense: 128 -> 4 (num classes)
   Returns (list full-sequential conv-layer-list dense-layer-list); the raw
   layer lists are kept so callers can wrap/inspect the two halves separately."
  (let* ((conv-layers-list
          (list (make-conv2d-layer 1 16 3 stride: 1 padding: 1
                                   activation: (make-relu) name: "Conv1")
                (make-conv2d-layer 16 32 3 stride: 2 padding: 1
                                   activation: (make-relu) name: "Conv2")
                (make-conv2d-layer 32 64 3 stride: 2 padding: 1
                                   activation: (make-relu) name: "Conv3")))
         (dense-layers-list
          (list (make-dense-layer (* 64 7 7) 128
                                  activation: (make-relu) name: "FC1")
                (make-dense-layer 128 num-classes
                                  activation: (make-identity) name: "Output")))
         (model (make-sequential
                 `(,@conv-layers-list
                   ,(make-flatten name: "Flatten")
                   ,@dense-layers-list)
                 name: "CNN")))
    (list model conv-layers-list dense-layers-list)))

;;; ==================================================================
;;; Helper Functions
;;; ==================================================================

(define (one-hot class num-classes)
  "Convert class index to one-hot vector"
  (let ((vec (make-f32vector num-classes 0.0)))
    (f32vector-set! vec class 1.0)
    vec))

(define (argmax vec)
  "Return index of maximum value"
  (let loop ((i 1) (max-i 0) (max-val (f32vector-ref vec 0)))
    (if (= i (f32vector-length vec))
        max-i
        (let ((val (f32vector-ref vec i)))
          (if (> val max-val)
              (loop (+ i 1) i val)
              (loop (+ i 1) max-i max-val))))))

(define (flatten-tensor tensor)
  "Flatten a multi-dimensional tensor to 1D"
  (let* ((shape (tensor-shape tensor))
         (total-size (apply * shape)))
    (reshape tensor (list total-size))))

(define (forward-cnn model x)
  "Forward pass through the full CNN sequential (flatten is a layer inside it)."
  (let ((model-layer (car model)))
    (forward model-layer x)))

;;; ==================================================================
;;; Training Functions
;;; ==================================================================

(define (split-into-batches data batch-size)
  "Split DATA into consecutive batches of at most BATCH-SIZE samples."
  (let loop ((remaining data) (result '()))
    (if (null? remaining)
        (reverse result)
        (let* ((batch-end (min batch-size (length remaining)))
               (batch (take remaining batch-end))
               (rest (drop remaining batch-end)))
          (loop rest (cons batch result))))))

(define (train-epoch model optimizer train-data #!key (batch-size 32))
  "Train for one epoch with per-sample forward passes and mini-batch gradient
   accumulation (legacy path; see train-epoch-batched for the batched version).
   Returns (values avg-loss accuracy).
   BUG FIX: previously bound (car model) - the FULL sequential including
   flatten and dense layers - as the 'conv' half, then flattened and ran the
   dense layers a second time.  Now wraps the raw layer lists from build-cnn."
  (let ((total-loss 0.0)
        (correct 0)
        (n (length train-data))
        (conv-layers (make-sequential (cadr model) name: "Conv-Layers"))
        (dense-layers (make-sequential (caddr model) name: "Dense-Layers")))
    (let ((batches (split-into-batches train-data batch-size))
          (batch-num 1))
      ;; Process each mini-batch
      (for-each
       (lambda (batch)
         (let ((actual-batch-size (length batch))) ; may be smaller for last batch
           ;; Accumulate gradients over the batch (one backward per sample)
           (for-each
            (lambda (sample)
              (let* ((img-data (car sample))
                     (true-class (cdr sample))
                     (img (make-tensor32 img-data
                                         (list num-channels image-size image-size)))
                     (target (make-tensor32 (one-hot true-class num-classes)
                                            (list num-classes)))
                     (conv-out (forward conv-layers img))
                     (flat (flatten-tensor conv-out))
                     (logits (forward dense-layers flat))
                     (probs (softmax logits))
                     (loss (cross-entropy-loss probs target)))
                ;; Accumulate metrics
                (set! total-loss (+ total-loss (f32vector-ref (tensor-data loss) 0)))
                (let ((pred-class (argmax (tensor-data logits))))
                  (when (= pred-class true-class)
                    (set! correct (+ correct 1))))
                ;; Backward pass - gradients accumulate automatically
                (backward! loss)))
            batch)
           ;; Scale accumulated gradients by 1/batch_size to get the average.
           ;; NOTE: uses vector-length-for-dtype so f64 parameters are measured
           ;; correctly (f32vector-length on an f64vector would be wrong).
           (let ((scale-factor (/ 1.0 actual-batch-size)))
             (for-each
              (lambda (param)
                (let ((grad (tensor-grad param)))
                  (when grad
                    (let ((n (vector-length-for-dtype grad (tensor-dtype param))))
                      (case (tensor-dtype param)
                        ((f32) (sscal! n scale-factor grad))
                        ((f64) (dscal! n scale-factor grad)))))))
              (append (parameters conv-layers) (parameters dense-layers))))
           ;; Update parameters once per batch (with averaged gradients)
           (step! optimizer)
           ;; Zero gradients for next batch
           (zero-grad-layer! conv-layers)
           (zero-grad-layer! dense-layers)
           (set! batch-num (+ 1 batch-num))))
       batches))
    (values (/ total-loss n) (/ correct n))))

(define (train-epoch-batched model optimizer train-data
                             #!key (batch-size 32) (gradient-diagnostics #f))
  "Train for one epoch with true batch processing: one forward/backward pass
   per mini-batch through the full sequential model.
   Returns (values avg-loss accuracy); when GRADIENT-DIAGNOSTICS is true the
   gradient monitor records every step and a summary is printed."
  (let* ((total-loss 0.0)
         (correct 0)
         (n (length train-data))
         (model-layer (car model))
         (params (parameters model-layer))
         (monitor (make-gradient-monitor exploding-threshold: 10.0
                                         vanishing-threshold: 1e-7)))
    ;; Split data into indexed mini-batches (index is the diagnostics step)
    (let ((batches
           (let loop ((remaining train-data) (i 0) (result '()))
             (if (null? remaining)
                 (reverse result)
                 (let* ((batch-end (min batch-size (length remaining)))
                        (batch (take remaining batch-end))
                        (rest (drop remaining batch-end)))
                   (loop rest (+ i 1) (cons (cons i batch) result)))))))
      (for-each
       (lambda (step+batch)
         (let* ((step (car step+batch))
                (batch (cdr step+batch))
                (actual-batch-size (length batch))
                ;; Stack entire batch into single tensors
                (batch-images (stack-images batch))
                (batch-targets (stack-targets batch))
                ;; Single forward pass for entire batch: (N,C,H,W) -> (N,classes)
                (logits (forward model-layer batch-images))
                ;; Softmax over classes: (N, num_classes)
                (probs (softmax logits axis: -1))
                ;; Cross-entropy loss averaged over the batch (scalar)
                (loss (cross-entropy-loss probs batch-targets
                                          reduction: 'mean
                                          from-logits: #f)))
           ;; Accumulate loss for reporting (undo the mean reduction)
           (set! total-loss
                 (+ total-loss
                    (* (f32vector-ref (tensor-data loss) 0) actual-batch-size)))
           ;; Count correct predictions (argmax over logits equals argmax
           ;; over probs since softmax is monotone)
           (let ((logits-data (tensor-data logits)))
             (do ((i 0 (+ i 1))) ((= i actual-batch-size))
               (let* ((offset (* i num-classes))
                      (pred-class (argmax-offset logits-data offset num-classes))
                      (true-class (inexact->exact
                                   (f32vector-ref (tensor-data batch-targets) i))))
                 (when (= pred-class true-class)
                   (set! correct (+ correct 1))))))
           ;; Single backward pass for the batch (loss already mean-reduced)
           (backward! loss)
           ;; Check gradient health
           (if gradient-diagnostics
               (record-step! monitor step params))
           ;; Single optimizer step, then clear gradients for the next batch
           (step! optimizer)
           (zero-grad-layer! model-layer)))
       batches))
    (if gradient-diagnostics
        (begin
          (printf "\nTraining Diagnostics:\n")
          (let ((diagnosis (diagnose-training monitor)))
            (printf "Total steps: ~A\n" (cdr (assoc 'total-steps diagnosis)))
            (printf "Mean gradient norm: ~A\n"
                    (cdr (assoc 'mean-gradient-norm diagnosis)))
            (printf "Unhealthy steps: ~A\n"
                    (cdr (assoc 'unhealthy-steps diagnosis))))))
    (values (/ total-loss n) (/ correct n))))

(define (evaluate model test-data)
  "Evaluate model on test data, one sample at a time.
   Returns (values accuracy confusion-matrix) where the confusion matrix is a
   flat vector indexed by true-class*num-classes + predicted-class."
  (let ((correct 0)
        (total (length test-data))
        (confusion (make-vector (* num-classes num-classes) 0)))
    (for-each
     (lambda (sample)
       (let* ((img-data (car sample))
              (true-class (cdr sample))
              (img (make-tensor32 img-data
                                  (list num-channels image-size image-size)
                                  requires-grad?: #f))
              (logits (forward-cnn model img))
              (pred-class (argmax (tensor-data logits))))
         (when (= pred-class true-class)
           (set! correct (+ correct 1)))
         ;; Update confusion matrix
         (let ((idx (+ (* true-class num-classes) pred-class)))
           (vector-set! confusion idx (+ 1 (vector-ref confusion idx))))))
     test-data)
    (values (/ correct total) confusion)))

(define (print-confusion-matrix confusion)
  "Pretty print confusion matrix"
  (printf "\nConfusion Matrix:\n")
  (printf "        ")
  (do ((i 0 (+ i 1))) ((= i num-classes))
    (printf "Pred-~A " i))
  (printf "\n")
  (do ((true-class 0 (+ true-class 1))) ((= true-class num-classes))
    (printf "True-~A " true-class)
    (do ((pred-class 0 (+ pred-class 1))) ((= pred-class num-classes))
      (let ((idx (+ (* true-class num-classes) pred-class)))
        (printf "~A " (vector-ref confusion idx))))
    (printf "\n")))

;;; ==================================================================
;;; Batched Evaluation
;;; ==================================================================

(define (evaluate-batched model test-data #!key (batch-size 64))
  "Evaluate model on test data using batched forward passes.
   Returns (values accuracy confusion-matrix)."
  (let ((correct 0)
        (total (length test-data))
        (confusion (make-vector (* num-classes num-classes) 0))
        (model-layer (car model)))
    (let ((batches (split-into-batches test-data batch-size)))
      (for-each
       (lambda (batch)
         (let* ((actual-batch-size (length batch))
                (batch-images (stack-images batch))
                (batch-targets (stack-targets batch))
                ;; Rebuild the image tensor without gradient tracking for eval
                (batch-images (make-tensor32 (tensor-data batch-images)
                                             (tensor-shape batch-images)
                                             requires-grad?: #f))
                (logits (forward model-layer batch-images))
                (logits-data (tensor-data logits)))
           ;; Process predictions
           (do ((i 0 (+ i 1))) ((= i actual-batch-size))
             (let* ((offset (* i num-classes))
                    (pred-class (argmax-offset logits-data offset num-classes))
                    (true-class (inexact->exact
                                 (f32vector-ref (tensor-data batch-targets) i))))
               (when (= pred-class true-class)
                 (set! correct (+ correct 1)))
               ;; Update confusion matrix
               (let ((idx (+ (* true-class num-classes) pred-class)))
                 (vector-set! confusion idx
                              (+ 1 (vector-ref confusion idx))))))))
       batches))
    (values (/ correct total) confusion)))

(define (argmax-offset vec offset length)
  "Find argmax in a slice of a vector starting at offset"
  (let loop ((i 1) (max-i 0) (max-val (f32vector-ref vec offset)))
    (if (= i length)
        max-i
        (let ((val (f32vector-ref vec (+ offset i))))
          (if (> val max-val)
              (loop (+ i 1) i val)
              (loop (+ i 1) max-i max-val))))))

;; Debugging helpers: trace gradient flow through the two halves of the model.

(define (debug-forward-pass model batch-images batch-targets)
  "Forward pass with gradient flow debugging.
   BUG FIX: previously called forward on the raw layer LISTS from build-cnn;
   they are now wrapped in sequentials so forward works on them."
  (let ((conv-layers (make-sequential (cadr model) name: "Conv-Layers"))
        (dense-layers (make-sequential (caddr model) name: "Dense-Layers")))
    (printf "\n=== Forward Pass Debug ===\n")
    ;; Step 1: Conv layers
    (printf "Input shape: ~A\n" (tensor-shape batch-images))
    (printf "Input requires-grad?: ~A\n" (tensor-requires-grad? batch-images))
    (let ((conv-out (forward conv-layers batch-images)))
      (printf "Conv output shape: ~A\n" (tensor-shape conv-out))
      (printf "Conv output requires-grad?: ~A\n" (tensor-requires-grad? conv-out))
      (printf "Conv output children count: ~A\n" (length (tensor-children conv-out)))
      ;; Step 2: Reshape
      (let* ((batch-shape (tensor-shape conv-out))
             (N (car batch-shape))
             (flattened-size (apply * (cdr batch-shape)))
             (flat (reshape conv-out (list N flattened-size))))
        (printf "\nReshape:\n")
        (printf "  Flat shape: ~A\n" (tensor-shape flat))
        (printf "  Flat requires-grad?: ~A\n" (tensor-requires-grad? flat))
        (printf "  Flat children count: ~A\n" (length (tensor-children flat)))
        (printf "  Flat children[0] == conv-out?: ~A\n"
                (if (null? (tensor-children flat))
                    "NO CHILDREN!"
                    (eq? (car (tensor-children flat)) conv-out)))
        ;; Step 3: Dense layers
        (let ((logits (forward dense-layers flat)))
          (printf "\nDense output:\n")
          (printf "  Logits shape: ~A\n" (tensor-shape logits))
          (printf "  Logits requires-grad?: ~A\n" (tensor-requires-grad? logits))
          ;; Step 4: Loss
          (let* ((probs (softmax logits axis: -1))
                 (loss (cross-entropy-loss probs batch-targets
                                           reduction: 'mean
                                           from-logits: #f)))
            (printf "\nLoss:\n")
            (printf "  Loss value: ~A\n" (tensor-data loss))
            (printf "  Loss requires-grad?: ~A\n" (tensor-requires-grad? loss))
            ;; Step 5: Backward pass
            (printf "\n=== Backward Pass Debug ===\n")
            (backward! loss)
            (printf "After backward:\n")
            (printf "  Loss grad: ~A\n" (tensor-grad loss))
            (printf "  Logits grad sum: ~A\n"
                    (if (tensor-grad logits)
                        (f32vector-fold + 0.0 (tensor-grad logits))
                        "NO GRAD"))
            (printf "  Flat grad sum: ~A\n"
                    (if (tensor-grad flat)
                        (f32vector-fold + 0.0 (tensor-grad flat))
                        "NO GRAD"))
            (printf "  Conv-out grad sum: ~A\n"
                    (if (tensor-grad conv-out)
                        (f32vector-fold + 0.0 (tensor-grad conv-out))
                        "NO GRAD"))
            ;; Check conv layer parameter gradients
            (printf "\nConv layer parameter gradients:\n")
            (let ((conv-params (parameters conv-layers)))
              (for-each
               (lambda (param idx)
                 (let ((grad (tensor-grad param)))
                   (printf "  Param ~A grad sum: ~A\n" idx
                           (if grad (f32vector-fold + 0.0 grad) "NO GRAD"))))
               conv-params
               (iota (length conv-params))))
            loss))))))

(define (debug-single-batch)
  "Debug gradient flow on a single small batch"
  (set-random-seed! 42)
  (printf "Generating small test batch...\n")
  (let* ((test-batch (generate-dataset 2))  ; 2 samples per class = 8 total
         (small-batch (take test-batch 4))  ; take 4 samples
         (model (build-cnn)))
    (printf "Building batch tensors...\n")
    (let ((batch-images (stack-images small-batch))
          (batch-targets (stack-targets small-batch)))
      (printf "Batch images shape: ~A\n" (tensor-shape batch-images))
      (printf "Batch targets shape: ~A\n" (tensor-shape batch-targets))
      ;; Run debug forward pass
      (debug-forward-pass model batch-images batch-targets))))

;;; ==================================================================
;;; Main Training Loop
;;; ==================================================================

(define (main)
  (printf "========================================\n")
  (printf "Convolutional Neural Network Example\n")
  (printf "========================================\n\n")
  ;; Set random seed for reproducibility
  (set-random-seed! 42)
  ;; Generate dataset
  (printf "Generating dataset...\n")
  (define train-data (generate-dataset 250)) ; 250 samples per class = 1000 total
  (define test-data (generate-dataset 50))   ; 50 samples per class = 200 total
  (printf "Training samples: ~A\n" (length train-data))
  (printf "Test samples: ~A\n\n" (length test-data))
  ;; Print class distribution
  (printf "Training set class distribution:\n")
  (let ((counts (make-vector num-classes 0)))
    (for-each
     (lambda (sample)
       (let ((class (cdr sample)))
         (vector-set! counts class (+ 1 (vector-ref counts class)))))
     train-data)
    (do ((i 0 (+ i 1))) ((= i num-classes))
      (printf "  Class ~A: ~A samples\n" i (vector-ref counts i))))
  (printf "\n")
  ;; Build model
  (printf "Building CNN model...\n")
  (define model (build-cnn))
  (define model-layer (car model))
  (define conv-layers (make-sequential (cadr model) name: "Conv-Layers"))
  (define dense-layers (make-sequential (caddr model) name: "Dense-Layers"))
  (printf "\nModel Architecture:\n")
  (printf "  Convolutional Layers:\n")
  (let ((conv-params (parameters conv-layers)))
    (printf "    Parameters: ~A\n"
            (fold (lambda (p acc) (+ acc (f32vector-length (tensor-data p))))
                  0 conv-params)))
  (printf "  Dense Layers:\n")
  (let ((dense-params (parameters dense-layers)))
    (printf "    Parameters: ~A\n"
            (fold (lambda (p acc) (+ acc (f32vector-length (tensor-data p))))
                  0 dense-params)))
  (printf "  Total Parameters: ~A\n"
          (+ (fold (lambda (p acc) (+ acc (f32vector-length (tensor-data p))))
                   0 (parameters conv-layers))
             (fold (lambda (p acc) (+ acc (f32vector-length (tensor-data p))))
                   0 (parameters dense-layers))))
  (printf "\n")
  ;; Create optimizer
  (define learning-rate 0.001)
  (printf "Optimizer: Adam (lr=~A)\n\n" learning-rate)
  (define optimizer (make-adam (parameters model-layer)
                               learning-rate: learning-rate
                               weight-decay: 0.0001))
  ;; Training loop
  (define num-epochs 20)
  (define best-acc 0.0)
  (printf "Training for ~A epochs...\n" num-epochs)
  (printf "----------------------------------------\n")
  (do ((epoch 1 (+ epoch 1)))
      ((> epoch num-epochs))
    ;; Train one epoch (gradient diagnostics only on the first epoch)
    (let-values (((avg-loss accuracy)
                  (train-epoch-batched model optimizer train-data
                                       batch-size: 64
                                       gradient-diagnostics: (= epoch 1))))
      (printf "Epoch ~A/~A - Loss: ~A - Acc: ~A"
              epoch num-epochs avg-loss (* 100.0 accuracy))
      ;; Evaluate every 5 epochs
      (when (= (modulo epoch 5) 0)
        (let-values (((test-acc confusion)
                      (evaluate-batched model test-data batch-size: 64)))
          (printf " - Test Acc: ~A" (* 100.0 test-acc))
          ;; Save checkpoint if best so far
          (when (> test-acc best-acc)
            (set! best-acc test-acc)
            (printf "\n  New best accuracy! Saving checkpoint...")
            (save-checkpoint model optimizer epoch avg-loss accuracy
                             (sprintf "best-cnn-model_~A.ngrd" epoch)))))
      (printf "\n"))
    ;; Learning rate decay every 10 epochs
    (when (= (modulo epoch 10) 0)
      (let ((new-lr (* (get-learning-rate optimizer) 0.5)))
        (set-learning-rate! optimizer new-lr)
        (printf "  - Learning rate decreased to ~A\n" new-lr))))
  (printf "----------------------------------------\n\n")
  ;; Save final model
  (printf "Saving final model...\n")
  (save-cnn-model model "final-cnn-model.ngrd")
  (printf "\n")
  ;; Final evaluation
  (printf "Final Evaluation on Test Set:\n")
  (let-values (((test-acc confusion) (evaluate model test-data)))
    (printf "Test Accuracy: ~A\n" (* 100.0 test-acc))
    (print-confusion-matrix confusion))
  (printf "\n")
  ;; Per-class accuracy (guard against empty rows so we never divide by zero)
  (printf "\nPer-Class Accuracy:\n")
  (let-values (((test-acc confusion) (evaluate model test-data)))
    (do ((class 0 (+ class 1))) ((= class num-classes))
      (let ((total 0)
            (correct 0))
        (do ((pred 0 (+ pred 1))) ((= pred num-classes))
          (let ((idx (+ (* class num-classes) pred)))
            (set! total (+ total (vector-ref confusion idx)))
            (when (= class pred)
              (set! correct (vector-ref confusion idx)))))
        (printf "  Class ~A: ~A (~A/~A)\n"
                class
                (if (> total 0) (* 100 (/ correct total)) 0)
                correct total))))
  (printf "\n")
  ;; Test on individual samples (all of test-data; was hard-coded to 200)
  (printf "Sample Predictions:\n")
  (do ((i 0 (+ i 1))) ((= i (length test-data)))
    (let* ((sample (list-ref test-data i))
           (img-data (car sample))
           (true-class (cdr sample))
           (img (make-tensor32 img-data
                               (list num-channels image-size image-size)
                               requires-grad?: #f))
           (logits (forward-cnn model img))
           (probs (softmax logits))
           (pred-data (tensor-data probs))
           (pred-class (argmax pred-data)))
      (printf "  Sample ~A: True=~A, Pred=~A " (+ i 1) true-class pred-class)
      (if (= pred-class true-class) (printf "O") (printf "X"))
      (printf " (confidence: ~A)\n"
              (* 100 (f32vector-ref pred-data pred-class)))))
  (printf "\n========================================\n")
  (printf "Training Complete!\n")
  (printf "========================================\n"))

;;; ==================================================================
;;; Model Analysis
;;; ==================================================================

(define (analyze-model-weights model)
  "Analyze weight statistics for each layer.
   NOTE: the layer-names list assumes the exact 10-parameter layout produced
   by build-cnn (3 conv + 2 dense layers, each with weight and bias)."
  (printf "\n========================================\n")
  (printf "Model Weight Analysis\n")
  (printf "========================================\n\n")
  (let ((params (parameters model))
        (layer-names '("Conv1-W" "Conv1-b" "Conv2-W" "Conv2-b"
                       "Conv3-W" "Conv3-b" "FC1-W" "FC1-b"
                       "FC2-W" "FC2-b")))
    (for-each
     (lambda (param name)
       (let* ((data (tensor-data param))
              (dtype (tensor-dtype param))
              (n (vector-length-for-dtype data dtype))
              (sum 0.0)
              (sum-sq 0.0)
              (min-val +inf.0)
              (max-val -inf.0))
         ;; Compute running statistics in one pass
         (do ((i 0 (+ i 1))) ((= i n))
           (let ((val (case dtype
                        ((f32) (f32vector-ref data i))
                        ((f64) (f64vector-ref data i)))))
             (set! sum (+ sum val))
             (set! sum-sq (+ sum-sq (* val val)))
             (set! min-val (min min-val val))
             (set! max-val (max max-val val))))
         (let* ((mean (/ sum n))
                (variance (- (/ sum-sq n) (* mean mean)))
                ;; clamp: variance can go slightly negative from rounding
                (stddev (sqrt (max 0.0 variance))))
           (printf "~A:\n" name)
           (printf "  Shape: ~A\n" (tensor-shape param))
           (printf "  Mean: ~8,5f\n" mean)
           (printf "  Std: ~8,5f\n" stddev)
           (printf "  Min: ~8,5f\n" min-val)
           (printf "  Max: ~8,5f\n" max-val)
           (printf "\n"))))
     params layer-names)))

;;; ==================================================================
;;; Model Persistence
;;; ==================================================================

(define (save-cnn-model model filepath)
  "Save CNN model to file.
   Args:
     model: the tuple (full-sequential conv-layer-list dense-layer-list)
     filepath: path where to save the model
   Only the full sequential is serialized; the layer lists are wrapped here
   just to report parameter counts."
  (let ((model-layer (car model))
        (conv-layers (make-sequential (cadr model) name: "Conv-Layers"))
        (dense-layers (make-sequential (caddr model) name: "Dense-Layers")))
    (printf "Saving model to ~A...\n" filepath)
    ;; Save using the built-in serialization
    (save-model model-layer filepath)
    (printf "Model saved successfully!\n")
    (printf "  Conv layers parameters: ~A\n" (length (parameters conv-layers)))
    (printf "  Dense layers parameters: ~A\n" (length (parameters dense-layers)))))

(define (load-cnn-model filepath)
  "Load CNN model from file.
   Args:
     filepath: path to the saved model
   Returns a tuple (full-model conv-layers dense-layers) compatible with
   forward-cnn and the training functions."
  (printf "Loading model from ~A...\n" filepath)
  (let* ((full-model (load-model filepath))
         (ser (layer->serializable full-model))
         (layer-type (assq 'type ser))
         ;; Fall back to an empty layer list when the file isn't a sequential
         (layer-list (if (eq? (and layer-type (cdr layer-type)) 'sequential)
                         (assq 'layers ser)
                         '(layers)))
         (layer-objects (map serializable->layer (cdr layer-list)))
         (conv-layers (filter conv2d-layer? layer-objects))
         (dense-layers (filter dense-layer? layer-objects)))
    (printf "Model loaded successfully!\n")
    ;; The loaded model is a sequential containing conv and dense layers;
    ;; extract them to match the expected model tuple structure.
    (list full-model conv-layers dense-layers)))

;;; ==================================================================
;;; Model Checkpointing
;;; ==================================================================

(define (save-checkpoint model optimizer epoch train-loss train-acc filepath)
  "Save training checkpoint including model, optimizer state, and metrics"
  (printf "\nSaving checkpoint at epoch ~A...\n" epoch)
  (save-cnn-model model filepath)
  ;; Metadata goes in a sidecar text file next to the model
  (let ((metadata-file (string-append filepath ".meta")))
    (with-output-to-file metadata-file
      (lambda ()
        (printf "epoch: ~A\n" epoch)
        (printf "train-loss: ~A\n" train-loss)
        (printf "train-acc: ~A\n" train-acc)
        (printf "timestamp: ~A\n" (current-seconds)))))
  (printf "Checkpoint saved!\n"))

(define (load-checkpoint filepath)
  "Load a training checkpoint.
   Returns: (values model metadata-lines)"
  (printf "Loading checkpoint from ~A...\n" filepath)
  (let ((model (load-cnn-model filepath))
        (metadata-file (string-append filepath ".meta")))
    ;; Load metadata if it exists
    (let ((metadata
           (if (file-exists? metadata-file)
               (with-input-from-file metadata-file
                 (lambda ()
                   (let ((lines '()))
                     (let loop ((line (read-line)))
                       (unless (eof-object? line)
                         (set! lines (cons line lines))
                         (loop (read-line))))
                     (reverse lines))))
               '())))
      (when (not (null? metadata))
        (printf "\nCheckpoint metadata:\n")
        (for-each (lambda (line) (printf "  ~A\n" line)) metadata))
      (values model metadata))))

;;; ==================================================================
;;; Advanced Training: Data Augmentation
;;; ==================================================================

(define (augment-image img-data)
  "Apply random augmentations (horizontal flip, brightness) to a copy of IMG-DATA.
   BUG FIX: the flip loop previously used (< x (quotient image-size 2)) as its
   TERMINATION test, which is true at x=0, so the flip never ran."
  (let ((aug-img (make-f32vector (f32vector-length img-data))))
    ;; Copy original
    (do ((i 0 (+ i 1))) ((= i (f32vector-length img-data)))
      (f32vector-set! aug-img i (f32vector-ref img-data i)))
    ;; Random horizontal flip (50% chance): swap columns across the midline
    (when (< (pseudo-random-real) 0.5)
      (do ((y 0 (+ y 1))) ((= y image-size))
        (do ((x 0 (+ x 1))) ((>= x (quotient image-size 2)))
          (let ((idx1 (+ (* y image-size) x))
                (idx2 (+ (* y image-size) (- image-size 1 x))))
            (let ((tmp (f32vector-ref aug-img idx1)))
              (f32vector-set! aug-img idx1 (f32vector-ref aug-img idx2))
              (f32vector-set! aug-img idx2 tmp))))))
    ;; Random brightness adjustment, clamped to [0, 1]
    (let ((brightness-delta (* 0.2 (- (pseudo-random-real) 0.5))))
      (do ((i 0 (+ i 1))) ((= i (f32vector-length aug-img)))
        (f32vector-set! aug-img i
                        (max 0.0 (min 1.0 (+ (f32vector-ref aug-img i)
                                             brightness-delta))))))
    aug-img))

(define (train-epoch-with-augmentation model optimizer train-data)
  "Train one epoch with data augmentation, updating after every sample.
   NOTE: MODEL here is a plain layer (e.g. the full sequential), not the
   (model conv dense) tuple used elsewhere in this file.
   Returns (values avg-loss accuracy)."
  (let ((total-loss 0.0)
        (correct 0)
        (n (length train-data)))
    (for-each
     (lambda (sample)
       (let* ((orig-img-data (car sample))
              (true-class (cdr sample))
              (aug-img-data (augment-image orig-img-data))
              (img (make-tensor32 aug-img-data
                                  (list num-channels image-size image-size)))
              (target (make-tensor32 (one-hot true-class num-classes)
                                     (list num-classes)))
              (logits (forward model img))
              (probs (softmax logits))
              (loss (cross-entropy-loss probs target)))
         (set! total-loss (+ total-loss (f32vector-ref (tensor-data loss) 0)))
         (let ((pred-class (argmax (tensor-data logits))))
           (when (= pred-class true-class)
             (set! correct (+ correct 1))))
         ;; Reset loss gradient before backward
         (let ((loss-grad (tensor-grad loss)))
           (f32vector-set! loss-grad 0 1.0))
         (backward! loss)
         (step! optimizer)
         (zero-grad-layer! model)))
     train-data)
    (values (/ total-loss n) (/ correct n))))

;; Run the basic example
(printf "\n")
(printf " NanoGrad CNN Example \n")
(printf " Image Classification with Conv Layers \n")
(printf "\n")

(main)

(define loaded-model (load-cnn-model "final-cnn-model.ngrd"))