;; ;; ;; Routines for parsing and printing comma-separated values. ;; ;; Based in part on RFC 4180, "Common Format and MIME Type for ;; Comma-Separated Values (CSV) Files", and on the Haskell Text.CSV ;; module by Jaap Weel. ;; ;; ;; Differences with the RFC: ;; ;; 1) the RFC prescribes CRLF standard network line breaks, but many ;; CSV files have platform-dependent line endings, so this library ;; accepts any sequence of CRs and LFs as a line break. ;; ;; 2) The format of header lines is exactly like a regular record ;; and the presence of a header can only be determined from the mime ;; type. available. This library treats all lines as regular ;; records. ;; ;; 3) The formal grammar specifies that fields can contain only ;; certain US ASCII characters, but the specification of the MIME ;; type allows for other character sets. This library allows all ;; characters in fields, except for the field delimiter character, ;; CRs and LFs in unquoted fields. This should make it possible to ;; parse CSV files in any encoding, but it allows for characters ;; such as tabs that the RFC may be interpreted to forbid even in ;; non-US-ASCII character sets. ;; ;; 4) According to the RFC, the records all have to have the same ;; length. This library allows variable length records. ;; ;; 5) The delimiter character is specified by the user and can be ;; a character other than comma, or an SRFI-14 character set. ;; ;; ;; Copyright 2009-2011 Ivan Raikov and the Okinawa Institute of ;; Science and Technology. ;; ;; ;; This program is free software: you can redistribute it and/or ;; modify it under the terms of the GNU General Public License as ;; published by the Free Software Foundation, either version 3 of ;; the License, or (at your option) any later version. ;; ;; This program is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; ;; A full copy of the GPL license can be found at ;; . (module csv (make-parser make-format list->csv-record csv-record->list CoreABNF->CSV ) (import scheme chicken data-structures srfi-1) (require-library srfi-13 regex) (import (only srfi-13 string-concatenate) (only regex regexp regexp-escape string-search string-substitute* string-split-fields)) (require-extension srfi-14 typeclass) (require-library abnf abnf-consumers) (import (prefix abnf abnf:) (prefix abnf-consumers abnf:) (only abnf ) ) (define-class ( A) csv) (define-record-type csv-record (list->csv-record elems) csv-record? (elems csv-record->list)) (define=> (non-escaped ) (lambda (delim) (abnf:bind-consumed->string (abnf:repetition (set (char-set-complement (char-set-union (if (char? delim) (char-set delim) delim) (string->char-set "\n\r\"")))))))) (define=> (escaped-dquote ) (lit "\"\"")) (define textdata (char-set-union (ucs-range->char-set #x20 #x22) (ucs-range->char-set #x23 #x2C) (ucs-range->char-set #x2D #x7F))) (define=> (escaped ) (lambda (escaped-dquote) (lambda (delim) (abnf:concatenation (abnf:drop-consumed dquote) (abnf:bind-consumed->string (abnf:repetition (abnf:alternatives escaped-dquote (set (char-set-union (if (char? delim) (char-set delim) delim) textdata))))) (abnf:drop-consumed dquote))))) (define (field non-escaped escaped) (lambda (delim) (abnf:alternatives (escaped delim) (non-escaped delim) ))) (define=> (record ) (lambda (field) (lambda (delim) (abnf:bind-consumed-strings->list list->csv-record (abnf:concatenation (field delim) (abnf:repetition (abnf:concatenation (abnf:drop-consumed (if (char? delim) (char delim) (set delim))) (field delim)))))))) (define=> (csv ) (lambda (record) (lambda (delim) (abnf:repetition (abnf:concatenation (record delim) (abnf:drop-consumed (abnf:repetition1 (set-from-string "\r\n")))))))) (define (->char-list s) (if (string? s) (string->list s) s)) (define (check-delimiter d) (if (not (or (char? d) (char-set? d))) (error 'parser "delimiter is not a character or a character set")) (cond ((char? d) (case d ((#\newline #\return #\") (error 'parser "delimiter character is one of newline, carriage return or quotation mark")))) ((char-set? d) (if (or (char-set-contains? d #\newline) (char-set-contains? d #\return) (char-set-contains? d #\")) (error 'parser "delimiter character set includes newline, carriage return or quotation mark"))))) (define (err s) (print "CSV parser error on stream: " s) (list)) (define=> (make-parser ) (lambda rest (let ((delimiter (if (null? rest) #\, (car rest)))) (check-delimiter delimiter) (let ((p (csv delimiter))) (lambda (s) (p (compose reverse car) err `(() ,(->char-list s)))))))) (define (CoreABNF->CSV A) (let* ((non-escaped (non-escaped A)) (escaped-dquote (escaped-dquote A)) (escaped ((escaped A) escaped-dquote)) (field (field non-escaped escaped)) (record ((record A) field)) (csv ((csv A) record))) (make- A csv))) (define rx-newline (regexp "[^\r\n]+")) (define (normalise-newlines s) (string-concatenate (intersperse (string-split-fields rx-newline s) "\r\n"))) (define rx-quote (regexp "\"")) (define (normalise-quotes s) (string-substitute* s `((,rx-quote . "\"\"")))) (define (make-format-cell delimiter) (define special-strs (map (compose regexp-escape ->string) (list delimiter #\" #\newline #\return))) (define rx-special (regexp (string-concatenate (intersperse special-strs "|")))) (define (format-cell x) (let ([str (format "~A" x)]) (if (string-search rx-special str) (string-append "\"" (normalise-newlines (normalise-quotes str)) "\"") str))) format-cell) (define (make-format . rest) (let-optionals rest ((delimiter #\,)) (define format-cell (make-format-cell delimiter)) (define (format-record rec) (and (csv-record? rec) (let ((ls (csv-record->list rec))) (string-concatenate (intersperse (map format-cell ls) (->string delimiter)))))) (define (format-csv ls) (and (pair? ls) (string-concatenate (append (intersperse (map format-record ls) "\r\n") (list "\r\n"))))) (values format-cell format-record format-csv))) )