;;; fancypants - Automatic ASCII smart quotes and ligature handling for SXML ; ; Copyright (c) 2006-2010 Peter Bex (Peter.Bex@xs4all.nl) ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; 1. Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; 2. Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the distribution. ; 3. Neither the name of Peter Bex nor the names of any contributors may ; be used to endorse or promote products derived from this software ; without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY PETER BEX AND CONTRIBUTORS ``AS IS'' AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS BE LIABLE ; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ; LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ; OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ; SUCH DAMAGE. (module fancypants (fancify smarten-quotes make-fancy-rules make-smart-quote-rules default-exceptions default-ligature-map default-punctuation-map default-arrow-map default-map all-quotes) (import chicken scheme) (use data-structures srfi-1 srfi-13) (cond-expand (total-irregex (require-library irregex) (import irregex)) (else (require-library regex) (import (rename irregex (irregex-match-start irregex-match-start-index) (irregex-match-end irregex-match-end-index))) (define (irregex-match-valid-index? m i) (and (irregex-match-start-index m i) #t)))) ;; Split up a string at predefined points, returning a list with the pieces. (define (point-split-string string points) (let loop ((p points)) (cond ((null? p) (list string)) ((= (string-length string) 0) '()) ((string-contains string (car p)) => (lambda (start) (let ((len (string-length (car p)))) (append (point-split-string (string-take string start) points) (list (string-copy string start (+ start len))) (point-split-string (string-drop string (+ start len)) points))))) (else (loop (cdr p)))))) ;; See http://www.unicode.org/charts/PDF/UFB00.pdf (define default-ligature-map '(("ffi" . (& "#xfb03")) ("ffl" . (& "#xfb04")) ("ff" . (& "#xfb00")) ("fi" . (& "#xfb01")) ("fl" . (& "#xfb02")) ;; ("st" . (& "#xfb06")) ;; This one is too conspicuous in standard fonts ("ft" . (& "#xfb05")))) ;; See http://www.unicode.org/charts/PDF/U2000.pdf (define default-punctuation-map '(("..." . (& "#x2026")) (".." . (& "#x2025")) (". . ." . (& "#x2026")) ;; We could also use #x2013, #x2014 instead. ("---" . (& "mdash")) ("--" . (& "ndash")))) ;; See http://www.unicode.org/charts/PDF/U2190.pdf (define default-arrow-map '(("->>" . (& "#x21a0")) ("<<-" . (& "#x219e")) ("->|" . (& "#x21e5")) ("|<-" . (& "#x21e4")) ("<->" . (& "#x2194")) ("->" . (& "#x2192")) ("<-" . (& "#x2190")) ("<=>" . (& "#x21d4")) ("=>" . (& "#x21d2")) ("<=" . (& "#x21d0")))) (define default-map `(,@default-ligature-map ,@default-punctuation-map ,@default-arrow-map)) ;; Don't try to do anything with these (define default-exceptions '(head script pre code kbd samp @)) ;; Split ASCII "ligatures" and such from their surrounding strings and change ;; them to their respective Unicode ligature. ie, "fine" => ((& "#xfb01") "ne") (define (fancify string character-map) (map (lambda (piece) (alist-ref piece character-map string=? piece)) (point-split-string string (map car character-map)))) (define (make-fancy-rules . rest) (let-optionals rest ((exceptions default-exceptions) (character-map default-map)) `(,@(map (lambda (x) `(,x *preorder* . ,(lambda x x))) exceptions) (*text* . ,(lambda (tag str) (if (string? str) (cons '*flatten* (fancify str character-map)) str))) (*default* . ,(lambda contents (flatten-strings contents)))))) ;; Structure of these lists: (pre match post how counts?) ;; pre is the part of the string that's before the quote to match, post is the ;; string that is after the match (can be empty). ;; how is one of single, double, single-open, double-open, single-close ;; or double-close. ;; counts? is a boolean describing whether the quote should influence the ;; nesting of the next quote or not. (ie, "isn't" => #f, since the ' ;; doesn't mean an opening quote is closed by the quote). (define all-quotes '(("n" "'" "t" single-close #f) ; Aren't you? ("" "'" "re" single-close #f) ; We're here ("" "'" "s" single-close #f) ; Jack's widget ("s" "'" (or " " eos) single-close #f) ; James' car ((or bos " ") "'" (seq (+ numeric) "s") single-close #f) ; The '90s ("" "\"" "" double #t) ("" "``" "" double-open #t) ("" "''" "" double-close #t) ("" "'" "" single #t))) ;; See http://www.unicode.org/charts/PDF/U2000.pdf ;; This is pretty ugly code. (define (smarten-quotes contents #!optional (quotes all-quotes) (exceptions default-exceptions)) (let ((single-open-count 0) (double-open-count 0) (big-regex (irregex `(or ,@(map (lambda (parts) `(seq (submatch ,(first parts)) (submatch ,(second parts)) (submatch ,(third parts)))) quotes))))) (let loop ((contents contents) (result '())) (cond ((null? contents) (reverse result)) ((member (car contents) exceptions) (append (reverse result) contents)) ((pair? (car contents)) (loop (cdr contents) (cons (loop (car contents) '()) result))) ((string? (car contents)) (let string-loop ((str (car contents)) (result-strings '())) (let ((match (irregex-search big-regex str))) (if (not match) (let ((string-list (append result-strings (list str)))) (if (null? string-list) (loop (cdr contents) result) (loop (cdr contents) (cons (cons '*flatten* string-list) result)))) (let* ((before (string-take str (irregex-match-start-index match 0))) ; non-matching part (after (string-drop str (irregex-match-end-index match 0))) ; non-matching part (match-pos (let lp ((pos 1)) (if (irregex-match-valid-index? match pos) pos (lp (add1 pos))))) ;; Three parts of the matching quotes (parts (car (drop quotes (quotient match-pos 3)))) (pre (irregex-match-substring match match-pos)) (post (irregex-match-substring match (+ match-pos 2))) (new-quote (case (fourth parts) ((single-open) (when (fifth parts) (set! single-open-count (add1 single-open-count))) '(& "#x2018")) ((single-close) (when (and (fifth parts) (> single-open-count 0)) (set! single-open-count (sub1 single-open-count))) '(& "#x2019")) ((double-open) (when (fifth parts) (set! double-open-count (add1 double-open-count))) '(& "#x201c")) ((double-close) (when (and (fifth parts) (> double-open-count 0)) (set! double-open-count (sub1 double-open-count))) '(& "#x201d")) ;; For the balanced ones, close it if it was open, ;; open it if it was closed ((single) (if (> single-open-count 0) (begin (when (fifth parts) (set! single-open-count (sub1 single-open-count))) '(& "#x2019")) (begin (when (fifth parts) (set! single-open-count (add1 single-open-count))) '(& "#x2018")))) ((double) (if (> double-open-count 0) (begin (when (fifth parts) (set! double-open-count (sub1 double-open-count))) '(& "#x201d")) (begin (when (fifth parts) (set! double-open-count (add1 double-open-count))) '(& "#x201c")))) (else (error 'smarten-quotes "Unkown quote matching type: " (fourth parts)))))) (string-loop after (append result-strings (list (string-append before pre) new-quote post)))))))) (else (loop (cdr contents) (cons (car contents) result))))))) ;; We have to jump through some hoops to get the SXML normalized again. ;; That's what you get when trying to map one string to a list of strings :) ;; ;; NOTE: There's probably a way to eliminate all this reversing. ;; This is inefficient. (define (flatten-strings data) (let loop ((data data) (result '())) (cond ((null? data) (reverse result)) ((not (pair? data)) (cons data result)) ((and (pair? (car data)) (eq? (caar data) '*flatten*)) (loop (cdr data) (append (reverse (loop (cdar data) '())) result))) ((pair? (car data)) (loop (cdr data) (cons (loop (car data) '()) result))) (else (loop (cdr data) (cons (car data) result)))))) ;; This is a very simple implementation, it doesn't really use pre-post-order. ;; It's handy nonetheless because you can easily integrate it into an existing ;; pre-post-order chain with sxml-apply-rules. (define (make-smart-quote-rules . rest) (let-optionals rest ((exceptions default-exceptions) (quotes all-quotes)) `((*text* . ,(lambda (tag data) data)) ;; Not needed? (*default* *preorder* . ,(lambda (tag . contents) (flatten-strings (cons tag (smarten-quotes contents quotes exceptions)))))))) )