;;; @Package csv ;;; @Subtitle Comma-Separated Value (CSV) Utilities in Scheme ;;; @HomePage http://www.neilvandyke.org/csv-scheme/ ;;; @Author Neil Van Dyke ;;; @Version 0.10 ;;; @Date 2010-04-13 ;;; @PLaneT neil/csv:1:6 ;; $Id: csv.ss,v 1.199 2010/04/13 17:56:20 neilpair Exp $ ;;; @legal ;;; Copyright @copyright{} 2004--2009 Neil Van Dyke. This program is Free ;;; Software; you can redistribute it and/or modify it under the terms of the ;;; GNU Lesser General Public License as published by the Free Software ;;; Foundation; either version 3 of the License (LGPL 3), or (at your option) ;;; any later version. This program is distributed in the hope that it will be ;;; useful, but without any warranty; without even the implied warranty of ;;; merchantability or fitness for a particular purpose. See ;;; @indicateurl{http://www.gnu.org/licenses/} for details. For other licenses ;;; and consulting, please contact the author. ;;; @end legal ;#lang scheme/base ;;; @section Introduction ;;; The @b{csv} Scheme library provides utilities for reading various kinds of ;;; what are commonly known as ``comma-separated value'' (CSV) files. Since ;;; there is no standard CSV format, this library permits CSV readers to be ;;; constructed from a specification of the peculiarities of a given variant. ;;; A default reader handles the majority of formats. ;;; ;;; One of the main uses of this library is to import data from old crusty ;;; legacy applications into Scheme for data conversion and other processing. ;;; To that end, this library includes various conveniences for iterating over ;;; parsed CSV rows, and for converting CSV input to the ;;; @uref{http://pobox.com/~oleg/ftp/Scheme/SXML.html, SXML 3.0} Scheme XML ;;; format. ;;; ;;; This library requires R5RS, SRFI-6, SRFI-23, and an @code{integer->char} ;;; procedure that accepts ASCII values. ;;; ;;; Other implementations of some kind of CSV reading for Scheme include ;;; Gauche's @code{text.csv} module, and Scsh's @code{record-reader} and ;;; related procedures. This library intends to be portable and more ;;; comprehensive. ;; TODO: Briefly introduce terms "row", "column", and "field". (define-syntax %csv:error (syntax-rules () ((_ p m o) (error (string-append p " : " m) o) ;; Bigloo: (error p m o) ))) (define-syntax %csv:type-error (syntax-rules () ((_ proc-str expected-str got-value) (%csv:error proc-str (string-append "expected " expected-str ", received:") got-value)))) (define %csv:a2c integer->char) (define %csv:cr (%csv:a2c 13)) (define %csv:lf (%csv:a2c 10)) (define-syntax %csv:gosc (syntax-rules () ((_ os-stx) (let* ((os os-stx) (str (get-output-string os))) (close-output-port os) str)))) (define (%csv:in-arg proc-name in) (cond ((input-port? in) in) ((string? in) (open-input-string in)) (else (%csv:type-error proc-name "input port or string" in)))) (define (%csv:reader-or-in-arg proc-name reader-or-in) (cond ((procedure? reader-or-in) reader-or-in) ((input-port? reader-or-in) (make-csv-reader reader-or-in)) ((string? reader-or-in) (make-csv-reader (open-input-string reader-or-in))) (else (%csv:type-error proc-name "csv reader or input port or string" reader-or-in)))) ;;; @section Reader Specs ;;; CSV readers are constructed using @dfn{reader specs}, which are sets of ;;; attribute-value pairs, represented in Scheme as association lists keyed on ;;; symbols. Each attribute has a default value if not specified otherwise. ;;; The attributes are: ;;; @table @code ;;; ;;; @item newline-type ;;; Symbol representing the newline, or record-terminator, convention. The ;;; convention can be a fixed character sequence (@code{lf}, @code{crlf}, or ;;; @code{cr}, corresponding to combinations of line-feed and carriage-return), ;;; any string of one or more line-feed and carriage-return characters ;;; (@code{lax}), or adaptive (@code{adapt}). @code{adapt} attempts to detect ;;; the newline convention at the start of the input and assume that convention ;;; for the remainder of the input. Default: @code{lax} ;;; ;;; @item separator-chars ;;; Non-null list of characters that serve as field separators. Normally, this ;;; will be a list of one character. Default: @code{(#\,)} (list of the comma ;;; character) ;;; ;;; @item quote-char ;;; Character that should be treated as the quoted field delimiter character, ;;; or @code{#f} if fields cannot be quoted. Note that there can be only one ;;; quote character. Default: @code{#\"} (double-quote) ;;; ;;; @item quote-doubling-escapes? ;;; Boolean for whether or not a sequence of two @code{quote-char} quote ;;; characters within a quoted field constitute an escape sequence for ;;; including a single @code{quote-char} within the string. Default: @code{#t} ;;; ;;; @item comment-chars ;;; List of characters, possibly null, which comment out the entire line of ;;; input when they appear as the first character in a line. Default: ;;; @code{()} (null list) ;;; ;;; @item whitespace-chars ;;; List of characters, possibly null, that are considered @dfn{whitespace} ;;; constituents for purposes of the @code{strip-leading-whitespace?} and ;;; @code{strip-trailing-whitespace?} attributes described below. ;;; Default: @code{(#\space)} (list of the space character) ;;; ;;; @item strip-leading-whitespace? ;;; Boolean for whether or not leading whitespace in fields should be ;;; stripped. Note that whitespace within a quoted field is never stripped. ;;; Default: @code{#f} ;;; ;;; @item strip-trailing-whitespace? ;;; Boolean for whether or not trailing whitespace in fields should be ;;; stripped. Note that whitespace within a quoted field is never stripped. ;;; Default: @code{#f} ;;; ;;; @item newlines-in-quotes? ;;; Boolean for whether or not newline sequences are permitted within quoted ;;; fields. If true, then the newline characters are included as part of the ;;; field value; if false, then the newline sequence is treated as a premature ;;; record termination. Default: @code{#t} ;;; ;;; @end table ;; TODO: Do not expose this procedure for now. We expect it to go away and be ;; replaced with two other procedures. ;; ;; @defproc %csv:csv-spec-derive orig-spec changes ;; ;; Yields a new CSV spec that is derived from @var{orig-spec} by applying spec ;; @var{changes} as attribute substitions and additions to the original. For ;; example, given an original CSV reader spec: ;; ;; @lisp ;; (define my-first-csv-spec ;; '((newline-type . lax) ;; (separator-chars . (#\,)) ;; (quote-char . #\") ;; (quote-doubling-escapes? . #t) ;; (whitespace-chars . (#\space)))) ;; @end lisp ;; ;; a derived spec with a different @code{separator-chars} attribute and an ;; added @code{comment-chars} attribute can be created like: ;; ;; @lisp ;; (%csv:csv-spec-derive my-first-csv-spec ;; '((separator-chars . (#\%)) ;; (comment-chars . (#\#)))) ;; @result{} ;; ((separator-chars . (#\%)) ;; (comment-chars . (#\#)) ;; (newline-type . lax) ;; (quote-char . #\") ;; (quote-doubling-escapes? . #t) ;; (whitespace-chars . (#\space))) ;; @end lisp ;; ;; In that the yielded spec might share some structure with @var{orig-spec} ;; and/or @var{changes}. Most applications will not use this procedure ;; directly. (define (%csv:csv-spec-derive orig-spec changes) ;; TODO: Make this not share structure. Error-check and normalize at the ;; same time we clone. (let ((new-spec '())) (let ((add-to-new-spec (lambda (alist) (for-each (lambda (cell) (or (assq (car cell) new-spec) (set! new-spec (cons cell new-spec)))) alist)))) (add-to-new-spec changes) (add-to-new-spec orig-spec) (reverse new-spec)))) ;;; @section Making Reader Makers ;;; CSV readers are procedures that are constructed dynamically to close over a ;;; particular CSV input and yield a parsed row value each time the procedure ;;; is applied. For efficiency reasons, the reader procedures are themselves ;;; constructed by another procedure, @code{make-csv-reader-maker}, for ;;; particular CSV reader specs. (define (%csv:csv-error code extra) ;; TODO: Maybe make the CSV error handler user-specifiable, or allow user to ;; specify some errors that should be disregarded. ;; ;; TODO: Add position information. Keep track of character position while ;; reading. (%csv:error "[csv-reader]" (string-append "Erroneous CSV format: " (case code ((junk-after-quote-close) "Junk after close of quoted field:") (else (string-append "INTERNAL ERROR: Unknown code: " (symbol->string code))))) extra)) (define (%csv:newline-check-step0 newline-type c port) ;; (display "*DEBUG* (equal? newline-type 'lax) = ") ;; (write (equal? newline-type 'lax)) ;; (newline) ;; (display "*DEBUG* (eqv? newline-type 'lax) = ") ;; (write (eqv? newline-type 'lax)) ;; (newline) (case newline-type ((cr) (eqv? c %csv:cr)) ((lf) (eqv? c %csv:lf)) ((crlf) (if (eqv? c %csv:cr) (let ((c2 (peek-char port))) (cond ((eof-object? c2) ;; Note: This is a CR-EOF in an input that uses CR-LF ;; for terminating records. We are discarding the CR, ;; so it will not be added to the field string. We ;; might want to signal an error. #t) ((eqv? c2 %csv:lf) (read-char port) #t) (else #f))) #f)) ((lax detect) (cond ((eqv? c %csv:cr) (let ((c2 (peek-char port))) (cond ((eof-object? c2) #t) ((eqv? c2 %csv:lf) (read-char port) 'crlf) (else 'cr)))) ((eqv? c %csv:lf) 'lf) (else #f))) (else (%csv:error "%csv:make-portreader/positional" "unrecognized newline-type" newline-type)))) (define %csv:make-portreader/positional (letrec-syntax ((newline-check (syntax-rules () ((_ newline-type c port detected-newline-type) ;; Note: "port" and "detected-newline-type" must be identifiers. ;; "newline-type" and "c" must be identifiers or self-evals. (if (eqv? newline-type 'detect) (begin (set! detected-newline-type (%csv:newline-check-step0 newline-type c port)) detected-newline-type) (%csv:newline-check-step0 newline-type c port))))) (gosc-cons ;; Note: This is to ensure the output string is gotten and closed ;; before consing it with the result of a recursive call. (syntax-rules () ((_ os b) (let ((s (%csv:gosc os))) (cons s b)))))) (lambda (newline-type separator-chars quote-char quote-doubling-escapes? comment-chars whitespace-chars strip-leading-whitespace? strip-trailing-whitespace? newlines-in-quotes?) (lambda (port) (let ((dnlt #f) (escape-char #\\)) (let read-fields-or-eof ((c (read-char port))) (cond ((eof-object? c) '()) ((and strip-leading-whitespace? (memv c whitespace-chars)) ;; It's leading whitespace char when we're ignoring leading ;; whitespace in fields, and there might just be whitespace and ;; then an EOF, which should probably be considered just an EOF ;; rather than a row with one empty field, so just skip this ;; whitespace char. (read-fields-or-eof (read-char port))) ((and (not (null? comment-chars)) (memv c comment-chars)) ;; It's a comment char in the first column (or in the first ;; non-whitespace column, if "strip-leading-whitespace?" is ;; true), so skip to end of line. (let ((fake-dnlt #f)) (let loop ((c (read-char port))) (cond ((eof-object? c) '()) ((newline-check newline-type c port fake-dnlt) (read-fields-or-eof (read-char port))) (else (loop (read-char port))))))) (else ;; It's not going to be just an EOF, so try to read a row. (let ((row (let read-fields ((c c)) (cond ;; If an EOF or newline in an unquoted field, consider ;; the field and row finished. (We don't consider EOF ;; before newline to be an error, although perhaps that ;; would be a useful check for a freak premature ;; end-of-input when dealing with "well-formed" CSV). ((or (eof-object? c) (newline-check newline-type c port dnlt)) (list "")) ;; If a field separator, finish this field and cons ;; with value of recursive call to get the next field. ((memv c separator-chars) (cons "" (read-fields (read-char port)))) ;; If we're ignoring leading whitespace, and it's a ;; whitespace-chars character, then recurse to keep ;; finding the field start. ((and strip-leading-whitespace? (memv c whitespace-chars)) (read-fields (read-char port))) ;; If a quote, read a quoted field. ((and quote-char (eqv? c quote-char)) (let ((os (open-output-string))) (let loop ((c (read-char port))) (cond ((or (eof-object? c) (and (not newlines-in-quotes?) (newline-check newline-type c port dnlt))) (list (%csv:gosc os))) ((and escape-char (eqv? c escape-char)) ;FIXME can become unsynchronized (write-char (read-char port) os) (loop (read-char port))) ((and quote-char (eqv? c quote-char)) (if quote-doubling-escapes? (let ((c2 (read-char port))) (if (eqv? c2 quote-char) (begin (write-char c2 os) (loop (read-char port))) (gosc-cons os (let skip-after ((c c2)) (cond ((or (eof-object? c) (newline-check newline-type c port dnlt)) '()) ((memv c separator-chars) (read-fields (read-char port))) ((memv c whitespace-chars) ;; Note: We tolerate ;; whitespace after field ;; close quote even if ;; skip-trailing-whitespace? ;; is false. (skip-after (read-char port))) (else (%csv:csv-error 'junk-after-quote-close c))))))) (gosc-cons os (read-fields (read-char port))))) (else (write-char c os) (loop (read-char port))))))) ;; It's the start of an unquoted field. (else (let ((os (open-output-string))) (write-char c os) (let loop ((c (read-char port))) (cond ((or (eof-object? c) (newline-check newline-type c port dnlt)) (list (get-output-string os))) ((memv c separator-chars) (gosc-cons os (read-fields (read-char port)))) ((and strip-trailing-whitespace? (memv c whitespace-chars)) ;; TODO: Maybe optimize to avoid creating a new ;; output string every time we see whitespace. ;; We could use a string collector with unwrite. ;; And/or do lookahead to see whether whitespace ;; is only one character. Do this after we have ;; a better regression test suite. (let ((ws-os (open-output-string))) (write-char c ws-os) (let ws-loop ((c (read-char port))) (cond ((or (eof-object? c) (newline-check newline-type c port dnlt)) (close-output-port ws-os) (list (%csv:gosc os))) ((memv c separator-chars) (close-output-port ws-os) (gosc-cons os (read-fields (read-char port)))) ((memv c whitespace-chars) (write-char c ws-os) (ws-loop (read-char port))) (else (display (%csv:gosc ws-os) os) (write-char c os) (loop (read-char port))))))) (else (write-char c os) (loop (read-char port))))))))))) (if (null? row) row (if (eq? newline-type 'detect) (cons dnlt row) row))))))))))) (define %csv:make-portreader ;; TODO: Make a macro for the three times we list the spec attributes. (letrec ((pb (lambda (x) (if x #t #f))) (pc (lambda (x) (cond ((char? x) x) ((string? x) (case (string-length x) ((1) (string-ref x 0)) (else (%csv:type-error "make-csv-reader-maker" "character" x)))) (else (%csv:type-error "make-csv-reader-maker" "character" x))))) (pc-f (lambda (x) (cond ((not x) x) ((char? x) x) ((string? x) (case (string-length x) ((0) #f) ((1) (string-ref x 0)) (else (%csv:type-error "make-csv-reader-maker" "character or #f" x)))) (else (%csv:type-error "make-csv-reader-maker" "character or #f" x))))) (pe (lambda (x acceptable) (if (memq x acceptable) x (%csv:type-error "make-csv-reader-maker" (let ((os (open-output-string))) (display "symbol from the set " os) (write acceptable os) (%csv:gosc os)) x)))) (plc-n (lambda (x) (or (list? x) (%csv:type-error "make-csv-reader-maker" "list of characters" x)) (map pc x))) (plc (lambda (x) (let ((result (plc-n x))) (if (null? result) (%csv:type-error "make-csv-reader-maker" "non-null list of characters" x) result))))) (lambda (reader-spec) (let ((newline-type 'lax) (separator-chars '(#\,)) (quote-char #\") (quote-doubling-escapes? #t) (comment-chars '()) (whitespace-chars '(#\space)) (strip-leading-whitespace? #f) (strip-trailing-whitespace? #f) (newlines-in-quotes? #t)) ;; TODO: It's erroneous to have two entries for the same attribute in a ;; spec. However, it would be nice if we error-detected duplicate ;; entries, or at least had assq semantics (first, rather than last, ;; wins). Use csv-spec-derive's descendants for that. (for-each (lambda (item) (let ((v (cdr item))) (case (car item) ((newline-type) (set! newline-type (pe v '(cr crlf detect lax lf)))) ((separator-chars) (set! separator-chars (plc v))) ((quote-char) (set! quote-char (pc-f v))) ((quote-doubling-escapes?) (set! quote-doubling-escapes? (pb v))) ((comment-chars) (set! comment-chars (plc-n v))) ((whitespace-chars) (set! whitespace-chars (plc-n v))) ((strip-leading-whitespace?) (set! strip-leading-whitespace? (pb v))) ((strip-trailing-whitespace?) (set! strip-trailing-whitespace? (pb v))) ((newlines-in-quotes?) (set! newlines-in-quotes? (pb v)))))) reader-spec) (%csv:make-portreader/positional newline-type separator-chars quote-char quote-doubling-escapes? comment-chars whitespace-chars strip-leading-whitespace? strip-trailing-whitespace? newlines-in-quotes?))))) ;;; @defproc make-csv-reader-maker reader-spec ;;; ;;; Constructs a CSV reader constructor procedure from the @var{reader-spec}, ;;; with unspecified attributes having their default values. ;;; ;;; For example, given the input file @code{fruits.csv} with the content: ;;; ;;; @example ;;; apples | 2 | 0.42 ;;; bananas | 20 | 13.69 ;;; @end example ;;; ;;; a reader for the file's apparent format can be constructed like: ;;; ;;; @lisp ;;; (define make-food-csv-reader ;;; (make-csv-reader-maker ;;; '((separator-chars . (#\|)) ;;; (strip-leading-whitespace? . #t) ;;; (strip-trailing-whitespace? . #t)))) ;;; @end lisp ;;; ;;; The resulting @code{make-food-csv-reader} procedure accepts one argument, ;;; which is either an input port from which to read, or a string from which to ;;; read. Our example input file then can be be read by opening an input port ;;; on a file and using our new procedure to construct a reader on it: ;;; ;;; @lisp ;;; (define next-row ;;; (make-food-csv-reader (open-input-file "fruits.csv"))) ;;; @end lisp ;;; ;;; This reader, @code{next-row}, can then be called repeatedly to yield a ;;; parsed representation of each subsequent row. The parsed format is a list ;;; of strings, one string for each column. The null list is yielded to ;;; indicate that all rows have already been yielded. ;;; ;;; @lisp ;;; (next-row) @result{} ("apples" "2" "0.42") ;;; (next-row) @result{} ("bananas" "20" "13.69") ;;; (next-row) @result{} () ;;; @end lisp (define (make-csv-reader-maker reader-spec) (let ((make-portread (if (let ((p (assq 'newline-type reader-spec))) (and p (cdr p))) ;; Newline-adapting portreader-maker. (letrec ((detect-portread (%csv:make-portreader (%csv:csv-spec-derive reader-spec '((newline-type . detect))))) ;; TODO: The set of cr/crlf/lf newline-type portreaders are ;; constructed optimistically right now for two reasons: ;; 1. we don't yet sanitize reader-specs of shared structure ;; that can be mutated behind our backs; 2. eventually, we ;; want to add a "lots-o-shots?" argument that, when true, ;; would do this anyway. Consider. (cr-portread (%csv:make-portreader (%csv:csv-spec-derive reader-spec '((newline-type . cr))))) (crlf-portread (%csv:make-portreader (%csv:csv-spec-derive reader-spec '((newline-type . crlf))))) (lf-portread (%csv:make-portreader (%csv:csv-spec-derive reader-spec '((newline-type . lf)))))) (lambda () (let ((actual-portread #f)) (let ((adapt-portread (lambda (port) (let ((dnlt-row (detect-portread port))) (if (null? dnlt-row) dnlt-row (begin (set! actual-portread (case (car dnlt-row) ((cr) cr-portread) ((crlf) crlf-portread) ((lf) lf-portread) (else actual-portread))) (cdr dnlt-row))))))) (set! actual-portread adapt-portread) (lambda (port) (actual-portread port)))))) ;; Stateless portreader-maker. (let ((reusable-portread (%csv:make-portreader reader-spec))) (lambda () reusable-portread))))) (lambda (in) (let ((port (%csv:in-arg "[csv-reader]" in)) (portread (make-portread))) (lambda () (portread port)))))) ;;; @section Making Readers ;;; In addition to being constructed from the result of ;;; @code{make-csv-reader-maker}, CSV readers can also be constructed using ;;; @code{make-csv-reader}. ;;; @defproc make-csv-reader in [reader-spec] ;;; ;;; Construct a CSV reader on the input @var{in}, which is an input port or a ;;; string. If @var{reader-spec} is given, and is not the null list, then a ;;; ``one-shot'' reader constructor is constructed with that spec and used. If ;;; @var{reader-spec} is not given, or is the null list, then the default CSV ;;; reader constructor is used. For example, the reader from the ;;; @code{make-csv-reader-maker} example could alternatively have been ;;; constructed like: ;;; ;;; @lisp ;;; (define next-row ;;; (make-csv-reader ;;; (open-input-file "fruits.csv") ;;; '((separator-chars . (#\|)) ;;; (strip-leading-whitespace? . #t) ;;; (strip-trailing-whitespace? . #t)))) ;;; @end lisp (define make-csv-reader (let ((default-maker (make-csv-reader-maker '()))) (lambda (in . rest) (let ((spec (cond ((null? rest) '()) ((null? (cdr rest)) (car rest)) (else (%csv:error "make-csv-reader" "extraneous arguments" (cdr rest)))))) ((if (null? spec) default-maker (make-csv-reader-maker spec)) (%csv:in-arg "make-csv-reader" in)))))) ;;; @section High-Level Conveniences ;;; Several convenience procedures are provided for iterating over the CSV rows ;;; and for converting the CSV to a list. ;;; ;;; To the dismay of some Scheme purists, each of these procedures accepts a ;;; @var{reader-or-in} argument, which can be a CSV reader, an input port, or a ;;; string. If not a CSV reader, then the default reader constructor is used. ;;; For example, all three of the following are equivalent: ;;; ;;; @lisp ;;; (csv->list STRING ) ;;; @equiv{} ;;; (csv->list (make-csv-reader STRING )) ;;; @equiv{} ;;; (csv->list (make-csv-reader (open-input-string STRING ))) ;;; @end lisp ;;; @defproc csv-for-each proc reader-or-in ;;; ;;; Similar to Scheme's @code{for-each}, applies @var{proc}, a procedure of one ;;; argument, to each parsed CSV row in series. @var{reader-or-in} is the CSV ;;; reader, input port, or string. The return value is undefined. ;; TODO: Doc an example for this. (define (csv-for-each proc reader-or-in) (let ((reader (%csv:reader-or-in-arg "csv-for-each" reader-or-in))) (let loop ((row (reader))) (or (null? row) (begin (proc row) (loop (reader))))))) ;;; @defproc csv-map proc reader-or-in ;;; ;;; Similar to Scheme's @code{map}, applies @var{proc}, a procedure of one ;;; argument, to each parsed CSV row in series, and yields a list of the values ;;; of each application of @var{proc}, in order. @var{reader-or-in} is the CSV ;;; reader, input port, or string. ;; TODO: Doc an example for this. ;; (define (csv-map proc reader-or-in) ;; (let ((reader (%csv:reader-or-in-arg "csv-for-each" reader-or-in))) ;; (let ((head '())) ;; (let ((row (reader))) ;; (if (null? row) ;; head ;; (let ((pair (cons (proc row) '()))) ;; (set! head pair) ;; (let loop ((prior pair)) ;; (let ((row (reader))) ;; (if (null? row) ;; head ;; (let ((pair (cons (proc row) '()))) ;; (set-cdr! prior pair) ;; (loop pair))))))))))) (define (csv-map proc reader-or-in) (let ((reader (%csv:reader-or-in-arg "csv-for-each" reader-or-in))) (let loop ((row (reader)) (ret null)) (if (null? row) (reverse ret) (let ((ret (cons (proc row) ret))) (loop (reader) ret)))))) ;;; @defproc csv->list reader-or-in ;;; ;;; Yields a list of CSV row lists from input @var{reader-or-in}, which is a ;;; CSV reader, input port, or string. ;; TODO: Doc an example for this. ;; (define (csv->list reader-or-in) ;; (let ((reader (%csv:reader-or-in-arg "csv->list" reader-or-in))) ;; (let ((head '())) ;; (let ((row (reader))) ;; (if (null? row) ;; head ;; (let ((pair (cons row '()))) ;; (set! head pair) ;; (let loop ((prior pair)) ;; (let ((row (reader))) ;; (if (null? row) ;; head ;; (let ((pair (cons row '()))) ;; (set-cdr! prior pair) ;; (loop pair))))))))))) (define (csv->list reader-or-in) (csv-map values reader-or-in)) ;;; @section Converting CSV to SXML ;;; The @code{csv->sxml} procedure can be used to convert CSV to SXML format, ;;; for processing with various XML tools. ;;; @defproc csv->sxml reader-or-in [row-element [col-elements]] ;;; ;;; Reads CSV from input @var{reader-or-in} (which is a CSV reader, input port, ;;; or string), and yields an SXML representation. If given, @var{row-element} ;;; is a symbol for the XML row element. If @var{row-element} is not given, ;;; the default is the symbol @code{row}. If given @var{col-elements} is a ;;; list of symbols for the XML column elements. If not given, or there are ;;; more columns in a row than given symbols, column element symbols are of the ;;; format @code{col-@var{n}}, where @var{n} is the column number (the first ;;; column being number 0, not 1). ;;; ;;; For example, given a CSV-format file @code{friends.csv} that has the ;;; contents: ;;; ;;; @example ;;; Binoche,Ste. Brune,33-1-2-3 ;;; Posey,Main St.,555-5309 ;;; Ryder,Cellblock 9, ;;; @end example ;;; ;;; with elements not given, the result is: ;;; ;;; @lisp ;;; (csv->sxml (open-input-file "friends.csv")) ;;; @result{} ;;; (*TOP* ;;; (row (col-0 "Binoche") (col-1 "Ste. Brune") (col-2 "33-1-2-3")) ;;; (row (col-0 "Posey") (col-1 "Main St.") (col-2 "555-5309")) ;;; (row (col-0 "Ryder") (col-1 "Cellblock 9") (col-2 ""))) ;;; @end lisp ;;; ;;; With elements given, the result is like: ;;; ;;; @lisp ;;; (csv->sxml (open-input-file "friends.csv") ;;; 'friend ;;; '(name address phone)) ;;; @result{} ;;; (*TOP* (friend (name "Binoche") ;;; (address "Ste. Brune") ;;; (phone "33-1-2-3")) ;;; (friend (name "Posey") ;;; (address "Main St.") ;;; (phone "555-5309")) ;;; (friend (name "Ryder") ;;; (address "Cellblock 9") ;;; (phone ""))) ;;; @end lisp (define csv->sxml (let* ((top-symbol (string->symbol "*TOP*")) (make-col-symbol (lambda (n) (string->symbol (string-append "col-" (number->string n))))) (default-col-elements (let loop ((i 0)) (if (= i 32) ; arbitrary magic number '() (cons (make-col-symbol i) (loop (+ 1 i))))))) ;; TODO: Have option to error when columns count doesn't match provided ;; column name list. (lambda (reader-or-in . rest) (let ((reader (%csv:reader-or-in-arg "csv->sxml" reader-or-in)) (row-element 'row) (col-elements #f)) ;; TODO: Maybe use case-lambda. (or (null? rest) (begin (set! row-element (car rest)) (let ((rest (cdr rest))) (or (null? rest) (begin (set! col-elements (car rest)) (let ((rest (cdr rest))) (or (null? rest) (%csv:error "csv->sxml" "extraneous arguments" rest)))))))) ;; TODO: We could clone and grow default-col-elements for the duration ;; of this procedure. (cons top-symbol (csv-map (lambda (row) (cons row-element (let loop ((vals row) (i 0) (names (or col-elements default-col-elements))) (if (null? vals) '() (cons (list (if (null? names) (make-col-symbol i) (car names)) (car vals)) (loop (cdr vals) (+ 1 i) (if (null? names) '() (cdr names)))))))) reader)))))) ;; TODO: Make a define-csv-reader/positional, for great constant-folding. ;; That's part of the reason some things are done the way they are. ;; TODO: Make a csv-bind, as a newbie convenience for people without advanced ;; match forms, which looks good in examples. This is better than a ;; csv-map/bind and a csv-for-each/bind. ;; ;; (csv-for-each/bind ((column-binding ...) body ...) ;; { (else => closure) | (else body ...) | } ;; input-port ;; [ csv-reader ]) ;; ;; (csv-for-each/bind ;; ((lastname firstname email) ;; ...) ;; (else => (lambda (row) (error "CSV row didn't match pattern" row))) ;; my-input-port ;; my-csv-reader) ;; TODO: Handle escapes, once we find an actual example or specification of any ;; flavor of escapes in CSV other than quote-doubling inside a quoted field. ;; TODO: Add a spec attribute for treating adjacent separators as one, or ;; skipping empty fields. This would probably only be used in practice for ;; parsing whitespace-separated input. ;; TODO: Get access to MS Excel or documentation, and make this correct. ;; ;; (define msexcel-csv-reader-spec ;; '((newline-type . crlf) ;; (separator-chars . (#\,)) ;; (quote-char . #\") ;; (quote-doubling-escapes? . #t) ;; (comment-chars . ()) ;; (whitespace-chars . (#\space)) ;; (strip-leading-whitespace? . #f) ;; (strip-trailing-whitespace? . #f) ;; (newlines-in-quotes? . #t))) ;; TODO: Maybe put this back in. ;; ;; (define default-csv-reader-spec ;; '((newline-type . lax) ;; (separator-chars . (#\,)) ;; (quote-char . #\") ;; (quote-doubling-escapes? . #t) ;; (comment-chars . ()) ;; (whitespace-chars . (#\space)) ;; (strip-leading-whitespace? . #f) ;; (strip-trailing-whitespace? . #f) ;; (newlines-in-quotes? . #t))) ;; TODO: Implement CSV writing, after CSV reading is field-tested and polished. ;; TODO: Call "close-input-port" once eof-object is hit, but make sure we still ;; can return an empty list on subsequent calls to the CSV reader. ;; TODO: Consider switching back to returning eof-object at the end of input. ;; We originally changed to returning the null list because we might want to ;; synthesize the EOF, and there is no R5RS binding for the eof-object. ;; TODO: [2005-12-09] In one test, Guile has a stack overflow when parsing a ;; row with 425 columns. Wouldn't hurt to see if we can make things more ;; tail-recursive. ;;; @unnumberedsec History ;;; @table @asis ;;; ;;; @item Version 0.10 -- 2010-04-13 -- PLaneT @code{(1 6)} ;;; Documentation fix. ;;; ;;; @item Version 0.9 -- 2009-03-14 -- PLaneT @code{(1 5)} ;;; Documentation fix. ;;; ;;; @item Version 0.8 -- 2009-02-23 -- PLaneT @code{(1 4)} ;;; Documentation changes. ;;; ;;; @item Version 0.7 -- 2009-02-22 -- PLaneT @code{(1 3)} ;;; License is now LGPL 3. Moved to author's new Scheme administration system. ;;; ;;; @item Version 0.6 -- 2008-08-12 -- PLaneT @code{(1 2)} ;;; For PLT 4 compatibility, new versions of @code{csv-map} and ;;; @code{csv->list} that don't use @code{set-cdr!} (courtesy of Doug ;;; Orleans). PLT 4 @code{if} compatibility change. Minor documentation fixes. ;;; ;;; @item Version 0.5 --- 2005-12-09 ;;; Changed a non-R5RS use of @code{letrec} to @code{let*}, caught by Guile and ;;; David Pirotte. ;;; ;;; @item Version 0.4 --- 2005-06-07 ;;; Converted to Testeez. Minor documentation changes. ;;; ;;; @item Version 0.3 --- 2004-07-21 ;;; Minor documentation changes. Test suite now disabled by default. ;;; ;;; @item Version 0.2 --- 2004-06-01 ;;; Work-around for @code{case}-related bug observed in Gauche 0.8 and 0.7.4.2 ;;; that was tickled by @code{csv-internal:make-portreader/positional}. Thanks ;;; to Grzegorz Chrupa@l{}a for reporting. ;;; ;;; @item Version 0.1 --- 2004-05-31 ;;; First release, for testing with real-world input. ;;; ;;; @end table #;(provide csv->list csv->sxml csv-for-each csv-map make-csv-reader make-csv-reader-maker)