;; Data frame implementation for CHICKEN Scheme. ;; ;; Copyright 2019 Ivan Raikov. ;; ;; Inspired by the various data frame implementations found in R, ;; Python, and Racket. ;; ;; This program is free software: you can redistribute it and/or ;; modify it under the terms of the Lesser GNU General Public License as ;; published by the Free Software Foundation, either version 3 of the ;; License, or (at your option) any later version. ;; ;; This program is distributed in the hope that it will be useful, but ;; WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; ;; A full copy of the GPL license can be found at ;; . ;; (module dataframe (column? column-key column-collection column-properties column-deserialize column-serialize data-frame? make-data-frame df-column-properties df-collection df-column df-columns df-keys df-items df-row-count df-filter-columns df-select-columns df-insert-column df-insert-derived df-insert-columns df-update-column df-delete-column df-for-each-column df-for-each-collection df-gen-columns df-gen-rows df-serialize df-deserialize df-from-rows apply-collections map-columns map-collections reduce-collections display.max-elements display.max-columns compare-symbol compare-int) (import scheme (chicken base) (chicken string) (chicken format) (prefix (only srfi-1 fold any) list.) (only srfi-69 symbol-hash) srfi-127 yasos yasos-collections rb-tree fmt fmt-table) (define display.max-elements (make-parameter 20)) (define display.max-columns (make-parameter 10)) (define *eof-object* (read (open-input-string ""))) (define (eof-object) *eof-object*) (define (list-take* lst n) (let recur ((lst lst) (k n)) (if (or (null? lst) (eq? 0 k)) '() (cons (car lst) (recur (cdr lst) (- k 1)))))) (define (enumerate lst) (let recur ((lst lst) (i 0) (ax '())) (if (null? lst) (reverse ax) (recur (cdr lst) (+ 1 i) (cons i ax))) )) ;; Helper returns #t if any element of list is null or #f if none (define (any-null? list) (cond ((null? list) #f) ((null? (car list)) #t) (else (any-null? (cdr list))))) ;; Safe version of lseq-rest that returns () if the argument is () (define (safe-lseq-rest obj) (if (null? obj) obj (lseq-rest obj))) (define (lseq-map-generator proc . lseqs) (let ((lseqsp (make-parameter lseqs))) (lambda () (let ((lseqsv (lseqsp))) (if (any-null? lseqsv) (eof-object) (let ((result (apply proc (map lseq-first lseqsv)))) (lseqsp (map safe-lseq-rest lseqsv)) result)))) )) (define-predicate column?) (define-operation (column-properties column)) (define-operation (column-key column)) (define-operation (column-collection column)) (define-operation (column-deserialize column port)) (define-operation (column-serialize column port)) (define-record-type data-column (make-data-column key collection properties) data-column? (key data-column-key set-data-column-key! ) (collection data-column-collection set-data-column-collection! ) (properties data-column-properties set-data-column-properties! ) ) (define-record-type derived-column (make-derived-column key properties parent collection) derived-column? (key derived-column-key) (properties derived-column-properties) (parent derived-column-parent) (collection derived-column-collection) ) (define-operation (column? ) (cond ((data-column? ) #t) ((derived-column? ) #t) (else #f) )) (define-operation (column-key ) (cond ((data-column? ) (data-column-key )) ((derived-column? ) (derived-column-key )) (else (error "operation not supported: column-key ")) )) (define-operation (column-properties ) (cond ((data-column? ) (data-column-properties )) ((derived-column? ) (derived-column-properties )) (else (error "operation not supported: column-properties ")) )) (define-operation (column-collection ) (cond ((data-column? ) (data-column-collection )) ((derived-column? ) (derived-column-collection )) (else (error "operation not supported: column-collection ")) )) (define-operation (column-deserialize port) (cond ((data-column? ) (let* ((data (read port)) (k (car data)) (p (cadr data)) (x (caddr data))) (set-data-column-key! k) (set-data-column-properties! p) (set-data-column-collection! x) )) (else (error "operation not supported: column-deserialize")) )) (define-operation (column-serialize port) (let ((k (column-key )) (p (column-properties )) (c (column-collection ))) (display #\( port) (write k port) (display #\space port) (write p port) (display #\space port) (display #\( port) (for-each-elt (lambda (x) (write x port) (display #\space port)) c) (display #\) port) (display #\) port) )) (define (make-derived-collection dproc c) (let ( (parent c) ) (object ;; collection behaviors ((collection? self) #t) ((size self) (size parent)) ((gen-keys self) (gen-keys parent)) ((gen-elts self) (g-map dproc (gen-elts parent))) ((for-each-key self proc) (for-each-key parent proc)) ((for-each-elt self proc) (for-each-elt parent (lambda (item) (proc (dproc item))))) ((elt-take self n) (map-elts (lambda (item) (dproc item)) (elt-take parent n) )) ((elt-drop self n) (map-elts (lambda (item) (dproc item)) (elt-drop parent n) )) )) ) (define (collection-from-rows source n) (let ( (accessor (lambda (c) (elt-ref c n))) (parent (if (procedure? source) (generator->list source) source)) ) (object ;; collection behaviors ((collection? self) #t) ((size self) (size parent)) ((gen-keys self) (gen-keys parent)) ((gen-elts self) (g-map accessor (gen-elts parent))) ((for-each-key self proc) (for-each-key parent proc)) ((for-each-elt self proc) (for-each-elt parent (lambda (item) (proc (accessor item))))) ((elt-take self n) (map-elts (lambda (item) (accessor item)) (elt-take parent n) )) ((elt-drop self n) (map-elts (lambda (item) (accessor item)) (elt-drop parent n) )) )) ) (define-predicate data-frame?) (define-operation (df-column-properties data-frame key failure-object)) (define-operation (df-collection data-frame key failure-object)) (define-operation (df-column data-frame key failure-object)) (define-operation (df-columns frame)) (define-operation (df-keys frame)) (define-operation (df-items frame)) (define-operation (df-row-count frame)) (define-operation (df-filter-columns frame proc)) (define-operation (df-select-columns frame keys)) (define-operation (df-insert-column data-frame key collection properties)) (define-operation (df-insert-derived data-frame parent-key key proc properties)) (define-operation (df-insert-columns data-frame lseq)) (define-operation (df-update-column data-frame key collection properties)) (define-operation (df-delete-column frame key)) (define-operation (df-for-each-column data-frame proc)) (define-operation (df-for-each-collection data-frame proc)) (define-operation (df-gen-collections data-frame)) (define-operation (df-gen-columns data-frame)) (define-operation (df-gen-rows data-frame)) (define-operation (df-serialize data-frame port)) (define-operation (df-deserialize data-frame port)) (define (compare-symbol x y) (- (symbol-hash x) (symbol-hash y))) (define (compare-int x y) (- x y)) (define (column-generator cols) (let ((keys (lseq-map column-key cols))) (lseq-map-generator cons keys cols) )) (define (row-generator cols) (let ((gcolls (lseq->list (lseq-map (compose (lambda (x) (if (collection? x) (gen-elts x) (gen-elts (list x)))) column-collection) cols)))) (lambda () (let ((row (map (lambda (g) (g)) gcolls))) (if (list.any eof-object? row) (eof-object) row)) )) ) (define (make-data-frame #!key (column-key-compare compare-symbol) (new-column-map (rb-tree-map column-key-compare))) (object-with-ancestors ( (column-map new-column-map) ) ((data-frame? self) #t) ((df-column-properties self key failure-object) (let ((cp ((operate-as column-map get/default) self key #f))) (if cp (column-properties cp) failure-object))) ((df-collection self key failure-object) (let ((cp ((operate-as column-map get/default) self key #f))) (if cp (column-collection cp) failure-object))) ((df-column self key failure-object) ((operate-as column-map get/default) self key failure-object)) ((df-columns self) (generator->lseq ((operate-as column-map gen-elts) self))) ((df-keys self) (generator->lseq ((operate-as column-map gen-keys) self))) ((df-items self) (generator->lseq (g-zip ((operate-as column-map gen-keys) self) ((operate-as column-map gen-elts) self)))) ((df-filter-columns self proc) (lseq-filter (lambda (col) (proc (column-key col) (column-properties col))) (generator->lseq ((operate-as column-map gen-elts) self)))) ((df-select-columns self keys) (lseq-filter (lambda (col) (member (column-key col) keys)) (generator->lseq ((operate-as column-map gen-elts) self)))) ((df-insert-column self key collection properties) (let* ((m1 ((operate-as column-map put) self key (make-data-column key collection properties)))) (make-data-frame new-column-map: m1))) ((df-insert-derived self parent-key key proc properties) (let ((cmap (list.fold (lambda (parent cmap) (let* ((collection (make-derived-collection proc (column-collection (cdr parent)))) (cmap1 (put cmap key (make-derived-column key properties parent collection)))) cmap1)) column-map (list (df-column self parent-key #f)) ))) (make-data-frame new-column-map: cmap))) ((df-insert-columns self cols) (let recur ((cols cols) (m column-map)) (if (null? cols) (make-data-frame new-column-map: m) (let* ((col (lseq-first cols)) (m1 ((operate-as column-map put) self (column-key col) col))) (recur (lseq-rest cols) m1))) )) ((df-update-column self key collection properties) (let ((m1 ((operate-as column-map update) self key (make-data-column key collection properties)))) (make-data-frame new-column-map: m1))) ((df-delete-column self key) (make-data-frame new-column-map: ((operate-as column-map delete) self key))) ((df-for-each-column self proc) ((operate-as column-map for-each-ascending) self (lambda (item) (proc item)))) ((df-for-each-collection self proc) ((operate-as column-map for-each-ascending) (lambda (item) (proc (column-collection item))))) ((df-gen-rows self) (let ((cols (df-columns self))) (row-generator cols))) ((df-gen-columns self) (let ((cols (df-columns self))) (column-generator cols))) ((df-row-count self) (let* ((cols (df-columns self)) (num-elements (size (column-collection (car cols))))) num-elements)) ((df-serialize self port) (let* ((cols (df-columns self)) (keys (lseq->list (lseq-map column-key cols)))) (write keys port) (df-for-each-column self (lambda (item) (column-serialize (cdr item) port))) )) ((df-deserialize self port) (let* ((keys (read port))) (let ((m1 (list.fold (lambda (key m) (let ((c (make-data-column key '() '()))) (column-deserialize c port) (put m key c))) column-map keys))) (make-data-frame new-column-map: m1)) )) ((show self port) (let* ((cols (df-columns self)) (keys (lseq->list (lseq-map column-key cols))) (num-elements (size (column-collection (car cols)))) (num-cols (length cols)) (show-keys (list-take* keys (display.max-columns))) (show-cols (if (> (display.max-elements) 0) (list-take* (lseq->list (lseq-map (compose (lambda (c) (elt-take c (display.max-elements))) column-collection) cols)) (display.max-columns)) (list-take* (lseq->list (lseq-map column-collection cols)) (display.max-columns))) )) (fmt (or port (current-output-port)) (fmt-table show-keys show-cols)) (if (and (> (display.max-elements) 0) (> num-elements (display.max-elements))) (fprintf (or port (current-output-port)) "(~A more elements ...)~%" (- num-elements (display.max-elements)))) (if (and (> (display.max-columns) 0) (> num-cols (display.max-columns))) (fprintf (or port (current-output-port)) "(~A more columns ...)~%" (- num-cols (display.max-columns)))) )) ) ) (define (df-from-rows column-keys source #!key (column-key-compare compare-symbol)) (let* ((column-map (rb-tree-map column-key-compare)) (m1 (list.fold (lambda (n key m) (let* ((data (collection-from-rows source n)) (c (make-data-column key data '()))) (put m key c))) column-map (enumerate column-keys) column-keys))) (make-data-frame column-key-compare: column-key-compare new-column-map: m1)) ) (define (apply-collections proc df . keys) (let ((ks (if (null? keys) (lseq->list (df-keys df)) keys))) (let ((colls (map (lambda (k) (column-collection (cdr (df-column df k #f)))) ks))) (apply proc colls)))) (define (apply-columns proc df . keys) (let ((ks (if (null? keys) (lseq->list (df-keys df)) keys))) (let ((cols (map (lambda (k) (df-column df k #f)) ks))) (apply proc cols)))) (define (map-columns proc df #!key (keys #f)) (let ((ks (or keys (lseq->list (df-keys df))))) (let recur ((res (make-data-frame)) (ks ks)) (if (null? ks) res (let ((col (cdr (df-column df (car ks) #f)))) (recur (df-insert-column res (column-key col) (proc col) '()) (cdr ks)) )) )) ) (define (map-collections proc df #!key (keys #f)) (let ((ks (or keys (lseq->list (df-keys df))))) (let recur ((res (make-data-frame)) (ks ks)) (if (null? ks) res (let ((col (cdr (df-column df (car ks) #f)))) (recur (df-insert-column res (column-key col) (proc (column-collection col)) '()) (cdr ks)) )) )) ) (define (reduce-collections proc df seed #!key (keys #f) ) (let ((ks (or keys (df-keys df)))) (let ((colls (map (lambda (k) (column-collection (cdr (df-column df k #f)))) ks))) (let recur ((colls colls) (ax seed)) (if (list.any null? colls) ax (let ((val (apply proc ax (map lseq-first colls)))) (recur (map lseq-rest colls) val)) )) )) ) )