;;;; srfi-14.scm - Shivers' reference implementation of SRFI-14 (declare (fixnum) (disable-interrupts)) (module srfi-14 (->char-set char-set char-set->list char-set->string char-set-adjoin char-set-adjoin! char-set-any char-set-complement char-set-complement! char-set-contains? char-set-copy char-set-count char-set-cursor char-set-cursor-next char-set-delete char-set-delete! char-set-diff+intersection char-set-diff+intersection! char-set-difference char-set-difference! char-set-every char-set-filter char-set-filter! char-set-fold char-set-for-each char-set-hash char-set-intersection char-set-intersection! char-set-map char-set-ref char-set-size char-set-unfold char-set-unfold! char-set-union char-set-union! char-set-xor char-set-xor! char-set:ascii char-set:blank char-set:digit char-set:empty char-set:full char-set:graphic char-set:hex-digit char-set:iso-control char-set:letter char-set:letter+digit char-set:lower-case char-set:printing char-set:punctuation char-set:symbol char-set:title-case char-set:upper-case char-set:whitespace char-set<= char-set= char-set? end-of-char-set? list->char-set list->char-set! string->char-set string->char-set! ucs-range->char-set ucs-range->char-set!) (import (scheme) (chicken base) (chicken fixnum) (chicken platform)) (register-feature! 'srfi-14) (define (%latin1->char n) (integer->char n)) (define (%char->latin1 c) (char->integer c)) ;;; SRFI-14 character-sets library -*- Scheme -*- ;;; ;;; - Ported from MIT Scheme runtime by Brian D. Carlstrom. ;;; - Massively rehacked & extended by Olin Shivers 6/98. ;;; - Massively redesigned and rehacked 5/2000 during SRFI process. ;;; At this point, the code bears the following relationship to the ;;; MIT Scheme code: "This is my grandfather's axe. My father replaced ;;; the head, and I have replaced the handle." Nonetheless, we preserve ;;; the MIT Scheme copyright: ;;; Copyright (c) 1988-1995 Massachusetts Institute of Technology ;;; The MIT Scheme license is a "free software" license. See the end of ;;; this file for the tedious details. ;;; Exports: ;;; char-set? char-set= char-set<= ;;; char-set-hash ;;; char-set-cursor char-set-ref char-set-cursor-next end-of-char-set? ;;; char-set-fold char-set-unfold char-set-unfold! ;;; char-set-for-each char-set-map ;;; char-set-copy ;;; ;;; char-set list->char-set string->char-set ;;; char-set! list->char-set! string->char-set! ;;; ;;; filterchar-set ucs-range->char-set ->char-set ;;; filterchar-set! ucs-range->char-set! ;;; ;;; char-set->list char-set->string ;;; ;;; char-set-size char-set-count char-set-contains? ;;; char-set-every char-set-any ;;; ;;; char-set-adjoin char-set-delete ;;; char-set-adjoin! char-set-delete! ;;; ;;; char-set-complement char-set-union char-set-intersection char-set-difference ;;; char-set-complement! char-set-union! char-set-intersection! char-set-difference! ;;; ;;; char-set-difference char-set-xor char-set-diff+intersection ;;; char-set-difference! char-set-xor! char-set-diff+intersection! ;;; ;;; char-set:lower-case char-set:upper-case char-set:title-case ;;; char-set:letter char-set:digit char-set:letter+digit ;;; char-set:graphic char-set:printing char-set:whitespace ;;; char-set:iso-control char-set:punctuation char-set:symbol ;;; char-set:hex-digit char-set:blank char-set:ascii ;;; char-set:empty char-set:full ;;; Imports ;;; This code has the following non-R5RS dependencies: ;;; - ERROR ;;; - %LATIN1->CHAR %CHAR->LATIN1 ;;; - LET-OPTIONALS* and :OPTIONAL macros for parsing, checking & defaulting ;;; optional arguments from rest lists. ;;; - BITWISE-AND for CHAR-SET-HASH ;;; - The SRFI-19 DEFINE-RECORD-TYPE record macro ;;; - A simple CHECK-ARG procedure: ;;; (lambda (pred val caller) (if (not (pred val)) (error val caller))) ;;; This is simple code, not great code. Char sets are represented as 256-char ;;; strings. If char I is ASCII/Latin-1 0, then it isn't in the set; if char I ;;; is ASCII/Latin-1 1, then it is in the set. ;;; - Should be rewritten to use bit strings or byte vecs. ;;; - Is Latin-1 specific. Would certainly have to be rewritten for Unicode. ;;; See the end of the file for porting and performance-tuning notes. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (make-char-set s) (##sys#make-structure 'char-set s)) (define (char-set:s cs) (##sys#slot cs 1)) (define (char-set? x) (##sys#structure? x 'char-set)) #| (define-record-type :char-set (make-char-set s) char-set? (s char-set:s)) |# (define (%string-copy s) (substring s 0 (string-length s))) ;;; Parse, type-check & default a final optional BASE-CS parameter from ;;; a rest argument. Return a *fresh copy* of the underlying string. ;;; The default is the empty set. The PROC argument is to help us ;;; generate informative error exceptions. (define (%default-base maybe-base proc) (if (pair? maybe-base) (let ((bcs (car maybe-base)) (tail (cdr maybe-base))) (if (null? tail) (if (char-set? bcs) (%string-copy (char-set:s bcs)) (##sys#error "BASE-CS parameter not a char-set" proc bcs)) (##sys#error "Expected final base char set -- too many parameters" proc maybe-base))) (make-string 256 (%latin1->char 0)))) ;;; If CS is really a char-set, do CHAR-SET:S, otw report an error msg on ;;; behalf of our caller, PROC. This procedure exists basically to provide ;;; explicit error-checking & reporting. (define (%char-set:s/check cs proc) (let lp ((cs cs)) (if (char-set? cs) (char-set:s cs) (lp (##sys#error proc "Not a char-set" cs))))) ;;; These internal functions hide a lot of the dependency on the ;;; underlying string representation of char sets. They should be ;;; inlined if possible. (define-inline (si=0? s i) (zero? (%char->latin1 (string-ref s i)))) (define-inline (si=1? s i) (not (si=0? s i))) (define-inline (si s i) (%char->latin1 (string-ref s i))) (define-inline (%set0! s i) (string-set! s i c0)) (define-inline (%set1! s i) (string-set! s i c1)) (define c0 (%latin1->char 0)) (define c1 (%latin1->char 1)) ;;; These do various "s[i] := s[i] op val" operations -- see ;;; %CHAR-SET-ALGEBRA. They are used to implement the various ;;; set-algebra procedures. (define-inline (setv! s i v) (string-set! s i (%latin1->char v))) ; SET to a Value. (define-inline (%not! s i v) (setv! s i (- 1 v))) (define-inline (%and! s i v) (if (zero? v) (%set0! s i))) (define-inline (%or! s i v) (if (not (zero? v)) (%set1! s i))) (define-inline (%minus! s i v) (if (not (zero? v)) (%set0! s i))) (define-inline (%xor! s i v) (if (not (zero? v)) (setv! s i (- 1 (si s i))))) (define (char-set-copy cs) (make-char-set (%string-copy (%char-set:s/check cs 'char-set-copy)))) (define char-set= (lambda rest (or (null? rest) (let* ((cs1 (car rest)) (rest (cdr rest)) (s1 (%char-set:s/check cs1 'char-set=))) (let lp ((rest rest)) (or (not (pair? rest)) (and (string=? s1 (%char-set:s/check (car rest) 'char-set=)) (lp (cdr rest))))))))) (define char-set<= (lambda rest (or (null? rest) (let ((cs1 (car rest)) (rest (cdr rest))) (let lp ((s1 (%char-set:s/check cs1 'char-set<=)) (rest rest)) (or (not (pair? rest)) (let ((s2 (%char-set:s/check (car rest) 'char-set<=)) (rest (cdr rest))) (if (eq? s1 s2) (lp s2 rest) ; Fast path (let lp2 ((i 255)) ; Real test (if (< i 0) (lp s2 rest) (and (<= (si s1 i) (si s2 i)) (lp2 (- i 1))))))))))) )) ;;; Hash ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Compute (c + 37 c + 37^2 c + ...) modulo BOUND. ;;; If you keep BOUND small enough, the intermediate calculations will ;;; always be fixnums. How small is dependent on the underlying Scheme system; ;;; we use a default BOUND of 2^22 = 4194304, which should hack it in ;;; Schemes that give you at least 29 signed bits for fixnums. The core ;;; calculation that you don't want to overflow is, worst case, ;;; (+ 65535 (* 37 (- bound 1))) ;;; where 65535 is the max character code. Choose the default BOUND to be the ;;; biggest power of two that won't cause this expression to fixnum overflow, ;;; and everything will be copacetic. (define (char-set-hash cs . maybe-bound) (let ((bound (optional maybe-bound 4194304))) (if (zero? bound) (set! bound 4194304)) (##sys#check-exact bound 'char-set-hash) (let* ((s (%char-set:s/check cs 'char-set-hash)) ;; Compute a 111...1 mask that will cover BOUND-1: (mask (let lp ((i #x10000)) ; Let's skip first 16 iterations, eh? (if (>= i bound) (- i 1) (lp (+ i i)))))) (let lp ((i 255) (ans 0)) (if (< i 0) (modulo ans bound) (lp (- i 1) (if (si=0? s i) ans (fxand mask (+ (* 37 ans) i)))))))) ) (define (char-set-contains? cs char) (##sys#check-char char 'char-set-contains?) (si=1? (%char-set:s/check cs 'char-set-contains?) ; (%char->latin1 (check-arg char? char char-set-contains?)))) (%char->latin1 char) ) ) (define (char-set-size cs) (let ((s (%char-set:s/check cs 'char-set-size))) (let lp ((i 255) (size 0)) (if (< i 0) size (lp (- i 1) (+ size (si s i))))))) (define (char-set-count pred cset) ; (check-arg procedure? pred char-set-count) (let ((s (%char-set:s/check cset 'char-set-count))) (let lp ((i 255) (count 0)) (if (< i 0) count (lp (- i 1) (if (and (si=1? s i) (pred (%latin1->char i))) (+ count 1) count)))))) ;;; -- Adjoin & delete (define (%set-char-set set proc cs chars) (let ((s (%string-copy (%char-set:s/check cs proc)))) (for-each (lambda (c) (set s (%char->latin1 c))) chars) (make-char-set s))) (define (%set-char-set! set proc cs chars) (let ((s (%char-set:s/check cs proc))) (for-each (lambda (c) (set s (%char->latin1 c))) chars)) cs) (define (char-set-adjoin cs . chars) (%set-char-set %set1! 'char-set-adjoin cs chars)) (define (char-set-adjoin! cs . chars) (%set-char-set! %set1! 'char-set-adjoin! cs chars)) (define (char-set-delete cs . chars) (%set-char-set %set0! 'char-set-delete cs chars)) (define (char-set-delete! cs . chars) (%set-char-set! %set0! 'char-set-delete! cs chars)) ;;; Cursors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Simple implementation. A cursors is an integer index into the ;;; mark vector, and -1 for the end-of-char-set cursor. ;;; ;;; If we represented char sets as a bit set, we could do the following ;;; trick to pick the lowest bit out of the set: ;;; (count-bits (xor (- cset 1) cset)) ;;; (But first mask out the bits already scanned by the cursor first.) (define (char-set-cursor cset) (%char-set-cursor-next cset 256 'char-set-cursor)) (define (end-of-char-set? cursor) (< cursor 0)) (define (char-set-ref cset cursor) (%latin1->char cursor)) (define (char-set-cursor-next cset cursor) (##sys#check-exact cursor 'char-set-cursor-next) ; (check-arg (lambda (i) (and (integer? i) (exact? i) (<= 0 i 255))) cursor ; char-set-cursor-next) (%char-set-cursor-next cset cursor 'char-set-cursor-next)) (define (%char-set-cursor-next cset cursor proc) ; Internal (let ((s (%char-set:s/check cset proc))) (let lp ((cur cursor)) (let ((cur (- cur 1))) (if (or (< cur 0) (si=1? s cur)) cur (lp cur)))))) ;;; -- for-each map fold unfold every any (define (char-set-for-each proc cs) ; (check-arg procedure? proc char-set-for-each) (let ((s (%char-set:s/check cs 'char-set-for-each))) (let lp ((i 255)) (cond ((>= i 0) (if (si=1? s i) (proc (%latin1->char i))) (lp (- i 1))))))) (define (char-set-map proc cs) ; (check-arg procedure? proc char-set-map) (let ((s (%char-set:s/check cs 'char-set-map)) (ans (make-string 256 c0))) (let lp ((i 255)) (cond ((>= i 0) (if (si=1? s i) (%set1! ans (%char->latin1 (proc (%latin1->char i))))) (lp (- i 1))))) (make-char-set ans))) (define (char-set-fold kons knil cs) ; (check-arg procedure? kons char-set-fold) (let ((s (%char-set:s/check cs 'char-set-fold))) (let lp ((i 255) (ans knil)) (if (< i 0) ans (lp (- i 1) (if (si=0? s i) ans (kons (%latin1->char i) ans))))))) (define (char-set-every pred cs) ; (check-arg procedure? pred char-set-every) (let ((s (%char-set:s/check cs 'char-set-every))) (let lp ((i 255)) (or (< i 0) (and (or (si=0? s i) (pred (%latin1->char i))) (lp (- i 1))))))) (define (char-set-any pred cs) ; (check-arg procedure? pred char-set-any) (let ((s (%char-set:s/check cs 'char-set-any))) (let lp ((i 255)) (and (>= i 0) (or (and (si=1? s i) (pred (%latin1->char i))) (lp (- i 1))))))) (define (%char-set-unfold! proc p f g s seed) ; (check-arg procedure? p proc) ; (check-arg procedure? f proc) ; (check-arg procedure? g proc) (let lp ((seed seed)) (cond ((not (p seed)) ; P says we are done. (%set1! s (%char->latin1 (f seed))) ; Add (F SEED) to set. (lp (g seed)))))) ; Loop on (G SEED). (define (char-set-unfold p f g seed . maybe-base) (let ((bs (%default-base maybe-base char-set-unfold))) (%char-set-unfold! char-set-unfold p f g bs seed) (make-char-set bs))) (define (char-set-unfold! p f g seed base-cset) (%char-set-unfold! char-set-unfold! p f g (%char-set:s/check base-cset 'char-set-unfold!) seed) base-cset) ;;; list <--> char-set (define (%list->char-set! chars s) (for-each (lambda (char) (%set1! s (%char->latin1 char))) chars)) (define (char-set . chars) (let ((s (make-string 256 c0))) (%list->char-set! chars s) (make-char-set s))) (define (list->char-set chars . maybe-base) (let ((bs (%default-base maybe-base list->char-set))) (%list->char-set! chars bs) (make-char-set bs))) (define (list->char-set! chars base-cs) (%list->char-set! chars (%char-set:s/check base-cs 'list->char-set!)) base-cs) (define (char-set->list cs) (let ((s (%char-set:s/check cs 'char-set->list))) (let lp ((i 255) (ans '())) (if (< i 0) ans (lp (- i 1) (if (si=0? s i) ans (cons (%latin1->char i) ans))))))) ;;; string <--> char-set (define (%string->char-set! str bs proc) (##sys#check-string str proc) ; (check-arg string? str proc) (do ((i (- (string-length str) 1) (- i 1))) ((< i 0)) (%set1! bs (%char->latin1 (string-ref str i))))) (define (string->char-set str . maybe-base) (let ((bs (%default-base maybe-base string->char-set))) (%string->char-set! str bs 'string->char-set) (make-char-set bs))) (define (string->char-set! str base-cs) (%string->char-set! str (%char-set:s/check base-cs 'string->char-set!) 'string->char-set!) base-cs) (define (char-set->string cs) (let* ((s (%char-set:s/check cs 'char-set->string)) (ans (make-string (char-set-size cs)))) (let lp ((i 255) (j 0)) (if (< i 0) ans (let ((j (if (si=0? s i) j (begin (string-set! ans j (%latin1->char i)) (+ j 1))))) (lp (- i 1) j)))))) ;;; -- UCS-range -> char-set (define (%ucs-range->char-set! lower upper error? bs proc) (##sys#check-exact lower proc) (##sys#check-exact upper proc) ; (check-arg (lambda (x) (and (integer? x) (exact? x) (<= 0 x))) lower proc) ; (check-arg (lambda (x) (and (integer? x) (exact? x) (<= lower x))) upper proc) (if (and (< lower upper) (< 256 upper) error?) (##sys#error "Requested UCS range contains unavailable characters -- this implementation only supports Latin-1" proc lower upper)) (let lp ((i (- (min upper 256) 1))) (cond ((<= lower i) (%set1! bs i) (lp (- i 1)))))) (define (ucs-range->char-set lower upper . rest) (let-optionals* rest ((error? #f) rest) (let ((bs (%default-base rest ucs-range->char-set))) (%ucs-range->char-set! lower upper error? bs 'ucs-range->char-set) (make-char-set bs)))) (define (ucs-range->char-set! lower upper error? base-cs) (%ucs-range->char-set! lower upper error? (%char-set:s/check base-cs 'ucs-range->char-set!) 'ucs-range->char-set) base-cs) ;;; -- predicate -> char-set (define (%char-set-filter! pred ds bs proc) ; (check-arg procedure? pred proc) (let lp ((i 255)) (cond ((>= i 0) (if (and (si=1? ds i) (pred (%latin1->char i))) (%set1! bs i)) (lp (- i 1)))))) (define (char-set-filter predicate domain . maybe-base) (let ((bs (%default-base maybe-base char-set-filter))) (%char-set-filter! predicate (%char-set:s/check domain 'char-set-filter!) bs char-set-filter) (make-char-set bs))) (define (char-set-filter! predicate domain base-cs) (%char-set-filter! predicate (%char-set:s/check domain 'char-set-filter!) (%char-set:s/check base-cs 'char-set-filter!) char-set-filter!) base-cs) ;;; {string, char, char-set, char predicate} -> char-set (define (->char-set x) (cond ((char-set? x) x) ((string? x) (string->char-set x)) ((char? x) (char-set x)) (else (##sys#error '->char-set "Not a charset, string or char." x)))) ;;; Set algebra ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; The exported ! procs are "linear update" -- allowed, but not required, to ;;; side-effect their first argument when computing their result. In other ;;; words, you must use them as if they were completely functional, just like ;;; their non-! counterparts, and you must additionally ensure that their ;;; first arguments are "dead" at the point of call. In return, we promise a ;;; more efficient result, plus allowing you to always assume char-sets are ;;; unchangeable values. ;;; Apply P to each index and its char code in S: (P I VAL). ;;; Used by the set-algebra ops. (define (%string-iter p s) (let lp ((i (- (string-length s) 1))) (cond ((>= i 0) (p i (%char->latin1 (string-ref s i))) (lp (- i 1)))))) ;;; String S represents some initial char-set. (OP s i val) does some ;;; kind of s[i] := s[i] op val update. Do ;;; S := S OP CSETi ;;; for all the char-sets in the list CSETS. The n-ary set-algebra ops ;;; all use this internal proc. (define (%char-set-algebra s csets op proc) (for-each (lambda (cset) (let ((s2 (%char-set:s/check cset proc))) (let lp ((i 255)) (cond ((>= i 0) (op s i (si s2 i)) (lp (- i 1))))))) csets)) ;;; -- Invert (define (char-set-complement cs) (let ((s (%char-set:s/check cs 'char-set-complement)) (ans (make-string 256))) (%string-iter (lambda (i v) (%not! ans i v)) s) (make-char-set ans))) (define (char-set-complement! cset) (let ((s (%char-set:s/check cset 'char-set-complement!))) (%string-iter (lambda (i v) (%not! s i v)) s)) cset) ;;; -- Union (define (char-set-union! cset1 . csets) (%char-set-algebra (%char-set:s/check cset1 'char-set-union!) csets %or! 'char-set-union!) cset1) (define (char-set-union . csets) (if (pair? csets) (let ((s (%string-copy (%char-set:s/check (car csets) 'char-set-union)))) (%char-set-algebra s (cdr csets) %or! 'char-set-union) (make-char-set s)) (char-set-copy char-set:empty))) ;;; -- Intersection (define (char-set-intersection! cset1 . csets) (%char-set-algebra (%char-set:s/check cset1 'char-set-intersection!) csets %and! 'char-set-intersection!) cset1) (define (char-set-intersection . csets) (if (pair? csets) (let ((s (%string-copy (%char-set:s/check (car csets) 'char-set-intersection)))) (%char-set-algebra s (cdr csets) %and! 'char-set-intersection) (make-char-set s)) (char-set-copy char-set:full))) ;;; -- Difference (define (char-set-difference! cset1 . csets) (%char-set-algebra (%char-set:s/check cset1 'char-set-difference!) csets %minus! 'char-set-difference!) cset1) (define (char-set-difference cs1 . csets) (if (pair? csets) (let ((s (%string-copy (%char-set:s/check cs1 'char-set-difference)))) (%char-set-algebra s csets %minus! 'char-set-difference) (make-char-set s)) (char-set-copy cs1))) ;;; -- Xor (define (char-set-xor! cset1 . csets) (%char-set-algebra (%char-set:s/check cset1 'char-set-xor!) csets %xor! 'char-set-xor!) cset1) (define (char-set-xor . csets) (if (pair? csets) (let ((s (%string-copy (%char-set:s/check (car csets) 'char-set-xor)))) (%char-set-algebra s (cdr csets) %xor! 'char-set-xor) (make-char-set s)) (char-set-copy char-set:empty))) ;;; -- Difference & intersection (define (%char-set-diff+intersection! diff int csets proc) (for-each (lambda (cs) (%string-iter (lambda (i v) (if (not (zero? v)) (cond ((si=1? diff i) (%set0! diff i) (%set1! int i))))) (%char-set:s/check cs proc))) csets)) (define (char-set-diff+intersection! cs1 cs2 . csets) (let ((s1 (%char-set:s/check cs1 'char-set-diff+intersection!)) (s2 (%char-set:s/check cs2 'char-set-diff+intersection!))) (%string-iter (lambda (i v) (if (zero? v) (%set0! s2 i) (if (si=1? s2 i) (%set0! s1 i)))) s1) (%char-set-diff+intersection! s1 s2 csets 'char-set-diff+intersection!)) (values cs1 cs2)) (define (char-set-diff+intersection cs1 . csets) (let ((diff (string-copy (%char-set:s/check cs1 'char-set-diff+intersection))) (int (make-string 256 c0))) (%char-set-diff+intersection! diff int csets 'char-set-diff+intersection) (values (make-char-set diff) (make-char-set int)))) ;;;; System character sets ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; These definitions are for Latin-1. ;;; ;;; If your Scheme implementation allows you to mark the underlying strings ;;; as immutable, you should do so -- it would be very, very bad if a client's ;;; buggy code corrupted these constants. (define char-set:empty (char-set)) (define char-set:full (char-set-complement char-set:empty)) (define char-set:lower-case (let* ((a-z (ucs-range->char-set #x61 #x7B)) (latin1 (ucs-range->char-set! #xdf #xf7 #t a-z)) (latin2 (ucs-range->char-set! #xf8 #x100 #t latin1))) (char-set-adjoin! latin2 (%latin1->char #xb5)))) (define char-set:upper-case (let ((A-Z (ucs-range->char-set #x41 #x5B))) ;; Add in the Latin-1 upper-case chars. (ucs-range->char-set! #xd8 #xdf #t (ucs-range->char-set! #xc0 #xd7 #t A-Z)))) (define char-set:title-case char-set:empty) (define char-set:letter (let ((u/l (char-set-union char-set:upper-case char-set:lower-case))) (char-set-adjoin! u/l (%latin1->char #xaa) ; FEMININE ORDINAL INDICATOR (%latin1->char #xba)))) ; MASCULINE ORDINAL INDICATOR (define char-set:digit (string->char-set "0123456789")) (define char-set:hex-digit (string->char-set "0123456789abcdefABCDEF")) (define char-set:letter+digit (char-set-union char-set:letter char-set:digit)) (define char-set:punctuation (let ((ascii (string->char-set "!\"#%&'()*,-./:;?@[\\]_{}")) (latin-1-chars (map %latin1->char '(#xA1 ; INVERTED EXCLAMATION MARK #xAB ; LEFT-POINTING DOUBLE ANGLE QUOTATION MARK #xAD ; SOFT HYPHEN #xB7 ; MIDDLE DOT #xBB ; RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK #xBF)))) ; INVERTED QUESTION MARK (list->char-set! latin-1-chars ascii))) (define char-set:symbol (let ((ascii (string->char-set "$+<=>^`|~")) (latin-1-chars (map %latin1->char '(#x00A2 ; CENT SIGN #x00A3 ; POUND SIGN #x00A4 ; CURRENCY SIGN #x00A5 ; YEN SIGN #x00A6 ; BROKEN BAR #x00A7 ; SECTION SIGN #x00A8 ; DIAERESIS #x00A9 ; COPYRIGHT SIGN #x00AC ; NOT SIGN #x00AE ; REGISTERED SIGN #x00AF ; MACRON #x00B0 ; DEGREE SIGN #x00B1 ; PLUS-MINUS SIGN #x00B4 ; ACUTE ACCENT #x00B6 ; PILCROW SIGN #x00B8 ; CEDILLA #x00D7 ; MULTIPLICATION SIGN #x00F7)))) ; DIVISION SIGN (list->char-set! latin-1-chars ascii))) (define char-set:graphic (char-set-union char-set:letter+digit char-set:punctuation char-set:symbol)) (define char-set:whitespace (list->char-set (map %latin1->char '(#x09 ; HORIZONTAL TABULATION #x0A ; LINE FEED #x0B ; VERTICAL TABULATION #x0C ; FORM FEED #x0D ; CARRIAGE RETURN #x20 ; SPACE #xA0)))) (define char-set:printing (char-set-union char-set:whitespace char-set:graphic)) ; NO-BREAK SPACE (define char-set:blank (list->char-set (map %latin1->char '(#x09 ; HORIZONTAL TABULATION #x20 ; SPACE #xA0)))) ; NO-BREAK SPACE (define char-set:iso-control (ucs-range->char-set! #x7F #xA0 #t (ucs-range->char-set 0 32))) (define char-set:ascii (ucs-range->char-set 0 128)) ) ; module ;;; Porting & performance-tuning notes ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; See the section at the beginning of this file on external dependencies. ;;; ;;; First and foremost, rewrite this code to use bit vectors of some sort. ;;; This will give big speedup and memory savings. ;;; ;;; - LET-OPTIONALS* macro. ;;; This is only used once. You can rewrite the use, port the hairy macro ;;; definition (which is implemented using a Clinger-Rees low-level ;;; explicit-renaming macro system), or port the simple, high-level ;;; definition, which is less efficient. ;;; ;;; - :OPTIONAL macro ;;; Very simply defined using an R5RS high-level macro. ;;; ;;; Implementations that can arrange for the base char sets to be immutable ;;; should do so. (E.g., Scheme 48 allows one to mark a string as immutable, ;;; which can be used to protect the underlying strings.) It would be very, ;;; very bad if a client's buggy code corrupted these constants. ;;; ;;; There is a fair amount of argument checking. This is, strictly speaking, ;;; unnecessary -- the actual body of the procedures will blow up if an ;;; illegal value is passed in. However, the error message will not be as good ;;; as if the error were caught at the "higher level." Also, a very, very ;;; smart Scheme compiler may be able to exploit having the type checks done ;;; early, so that the actual body of the procedures can assume proper values. ;;; This isn't likely; this kind of compiler technology isn't common any ;;; longer. ;;; ;;; The overhead of optional-argument parsing is irritating. The optional ;;; arguments must be consed into a rest list on entry, and then parsed out. ;;; Function call should be a matter of a few register moves and a jump; it ;;; should not involve heap allocation! Your Scheme system may have a superior ;;; non-R5RS optional-argument system that can eliminate this overhead. If so, ;;; then this is a prime candidate for optimising these procedures, ;;; *especially* the many optional BASE-CS parameters. ;;; ;;; Note that optional arguments are also a barrier to procedure integration. ;;; If your Scheme system permits you to specify alternate entry points ;;; for a call when the number of optional arguments is known in a manner ;;; that enables inlining/integration, this can provide performance ;;; improvements. ;;; ;;; There is enough *explicit* error checking that *all* internal operations ;;; should *never* produce a type or index-range error. Period. Feel like ;;; living dangerously? *Big* performance win to be had by replacing string ;;; and record-field accessors and setters with unsafe equivalents in the ;;; code. Similarly, fixnum-specific operators can speed up the arithmetic ;;; done on the index values in the inner loops. The only arguments that are ;;; not completely error checked are ;;; - string lists (complete checking requires time proportional to the ;;; length of the list) ;;; - procedure arguments, such as char->char maps & predicates. ;;; There is no way to check the range & domain of procedures in Scheme. ;;; Procedures that take these parameters cannot fully check their ;;; arguments. But all other types to all other procedures are fully ;;; checked. ;;; ;;; This does open up the alternate possibility of simply *removing* these ;;; checks, and letting the safe primitives raise the errors. On a dumb ;;; Scheme system, this would provide speed (by eliminating the redundant ;;; error checks) at the cost of error-message clarity. ;;; ;;; In an interpreted Scheme, some of these procedures, or the internal ;;; routines with % prefixes, are excellent candidates for being rewritten ;;; in C. ;;; ;;; It would also be nice to have the ability to mark some of these ;;; routines as candidates for inlining/integration. ;;; ;;; See the comments preceding the hash function code for notes on tuning ;;; the default bound so that the code never overflows your implementation's ;;; fixnum size into bignum calculation. ;;; ;;; All the %-prefixed routines in this source code are written ;;; to be called internally to this library. They do *not* perform ;;; friendly error checks on the inputs; they assume everything is ;;; proper. They also do not take optional arguments. These two properties ;;; save calling overhead and enable procedure integration -- but they ;;; are not appropriate for exported routines. ;;; Copyright notice ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Copyright (c) 1988-1995 Massachusetts Institute of Technology ;;; ;;; This material was developed by the Scheme project at the Massachusetts ;;; Institute of Technology, Department of Electrical Engineering and ;;; Computer Science. Permission to copy and modify this software, to ;;; redistribute either the original software or a modified version, and ;;; to use this software for any purpose is granted, subject to the ;;; following restrictions and understandings. ;;; ;;; 1. Any copy made of this software must include this copyright notice ;;; in full. ;;; ;;; 2. Users of this software agree to make their best efforts (a) to ;;; return to the MIT Scheme project any improvements or extensions that ;;; they make, so that these may be included in future releases; and (b) ;;; to inform MIT of noteworthy uses of this software. ;;; ;;; 3. All materials developed as a consequence of the use of this ;;; software shall duly acknowledge such use, in accordance with the usual ;;; standards of acknowledging credit in academic research. ;;; ;;; 4. MIT has made no warrantee or representation that the operation of ;;; this software will be error-free, and MIT is under no obligation to ;;; provide any services, by way of maintenance, update, or otherwise. ;;; ;;; 5. In conjunction with products arising from the use of this material, ;;; there shall be no use of the name of the Massachusetts Institute of ;;; Technology nor of any adaptation thereof in any advertising, ;;; promotional, or sales literature without prior written consent from ;;; MIT in each case.