;; todo: handle
(see abandoned blockquote branch) ;; todo: link rendering does not parse inline markup such as {{...}} (module svnwiki-sxml (svnwiki->sxml svnwiki-signature-parser svnwiki-signature->identifier ) (import scheme chicken) (require-library srfi-13 ports data-structures extras) (import extras) (import (only ports with-input-from-port with-input-from-string)) (import (only data-structures string-intersperse string-split string-translate* constantly)) (import (only srfi-13 string-trim string-trim-right string-trim-both string-concatenate-reverse string-index)) (use regex) (use matchable) (use html-parser) (import irregex) ;; (import (rename irregex (irregex irr))) ; debug ;; (define (irregex x) ;; (display "compiling ") ;; (write x) ;; (newline) ;; (time (irr x))) (define +identifier-tags+ (list "procedure" "macro" "read" "parameter" "record" "string" "class" "method" "constant" "setter" "syntax")) (define re:header (irregex '(: (submatch (>= 2 #\=)) (+ space) (submatch (+ any))))) ;; SVNWIKI interprets every character of *#*# and converts this into ;; ul ol ul ol -- if one char does not match in the next, a new list is ;; started at that nesting depth. We currently just rely on the last ;; character, and only at list open (define re:unordered-list (irregex '(: (submatch (* (or #\* #\#)) #\*) (+ space) (submatch (+ any))))) (define re:ordered-list (irregex '(: (submatch (* (or #\* #\#)) #\#) (+ space) (submatch (+ any))))) (define re:definition-list (irregex '(: #\; (* space) (submatch (* any))))) (define sre:preformatted '(: space (submatch (* any)))) (define re:preformatted (irregex sre:preformatted)) (define sre:definition-tag `(: #\< (submatch (or ,@+identifier-tags+)) #\> (submatch (: any (*? any))) " (* any))) (define sre:horizontal-rule '(: "---" (? #\-))) ; svnwiki is ---- only (define re:definition-tag (irregex sre:definition-tag)) (define re:horizontal-rule (irregex sre:horizontal-rule)) ;; This expression is very costly for irregex to compile ;; (define sre:enscript-tag-start ; crappy ;; '(: " ;; (submatch (* any)))) (define sre:enscript-tag-start '(: #\< "enscript" (submatch (* (~ #\>))) #\> (submatch (* any)))) ;; (define sre:tag-attribute ; compilation ok ;; '(: (* space) (submatch (+ alpha)) #\= #\' (submatch (+ (~ #\'))) #\')) ;; (define sre:tag-attribute ; compilation very slow ;; '(: (* space) (submatch (+ alpha)) #\= ;; (or (: #\' (submatch (+ (~ #\'))) #\') ;; (: #\" (submatch (+ (~ #\"))) #\") ;; (: (submatch (+ (~ space))))) ;; )) (define sre:tag-attribute ; trade compile time for runtime by using greedy '(: (* space) (submatch (+ alpha)) #\= ; alt: use a multi-fold and no (or ...) (or (: #\' (submatch (*? any)) #\') (: #\" (submatch (*? any)) #\") (: (submatch (+ (~ space))))))) (define re:tag-attribute (irregex sre:tag-attribute)) (define sre:nowiki-tag-start '(: "" (submatch (* any)))) (define re:nowiki-tag-start (irregex sre:nowiki-tag-start)) (define sre:table-tag-start '(: (submatch ") (submatch (* any)))) (define sre:examples-tag-start '(: "" (submatch (* any)))) (define re:examples-tag-start (irregex sre:examples-tag-start)) (define re:table-tag-start (irregex sre:table-tag-start)) (define re:enscript-tag-start (irregex sre:enscript-tag-start)) (define re:enscript-tag-end (irregex '(: (submatch (* any)) "" (submatch (* any))))) (define re:nowiki-tag-end (irregex '(: (submatch (* any)) "" (submatch (* any))))) (define re:table-tag-end (irregex '(: (submatch (* any)) "" (submatch (* any))))) (define re:examples-tag-end (irregex '(: (submatch (* any)) "" (submatch (* any))))) (define sre:directive '(: "[[" (submatch (or "tags" "toc")) ":" (submatch (* (~ #\]))) "]]")) (define re:directive (irregex sre:directive)) (define re:block ; Should probably use existing REs. (irregex `(or (: (>= 2 #\=) (+ space) (+ any)) ; header (: (+ (or #\* #\#)) (+ space) (+ any)) ; item-list (: #\; (+ any)) ; definition-list ,sre:preformatted ;; ,sre:definition-tag ; see below [*] ,sre:horizontal-rule ,sre:enscript-tag-start ,sre:nowiki-tag-start ,sre:table-tag-start ,sre:examples-tag-start ,sre:directive ))) ;; [*] Definition tag relies on backref, but "or" operator throws off ;; backref numbering indeterminately. Therefore we disable ;; definition-tag in re:block; negative effect is that it won't ;; break us out of current paragraph. One solution is to make ;; multiple irregex match calls, one per RE. Benchmark that. ;; Single-line readahead (effectively, adds peek-line to read-line) ;; This is a parameter to make it thread-local (define *buffered-line* (make-parameter #f)) (define (read-buffered-line #!optional (p (current-input-port))) ;; NOTE: buffer only works for one port and is not SRFI-18 compatible (let ((b (*buffered-line*))) (cond (b (*buffered-line* #f) b) (else (read-line p))))) (define (peek-buffered-line/normal #!optional (p (current-input-port))) (cond ((*buffered-line*)) (else (let ((line (read-line p))) (*buffered-line* line) line)))) (define (peek-buffered-line/debug #!optional (p (current-input-port))) (let ((line (peek-buffered-line/normal p))) (print "line: " line) line)) (define peek-buffered-line peek-buffered-line/normal) (define (poke-line line #!optional (p (current-input-port))) (*buffered-line* line)) (define (discard-line #!optional (p (current-input-port))) (*buffered-line* #f)) (define (section-body depth) (let ((line (peek-buffered-line))) (cond ((eof-object? line) '()) ((string-match re:header line) => (lambda (m) (let ((next-depth (string-length (cadr m))) (title (caddr m))) (cond ((> next-depth depth) (discard-line) (let ((title (match (inline title) ((t) t) ; interpolate title when possible (t t)))) (let ((sec `(section ,next-depth ,title . ,(section-body next-depth)))) (cons sec (section-body depth))))) (else '()))))) ((string=? line "") (discard-line) (section-body depth)) (else (cons (block) (section-body depth)))))) (define (block) (let ((line (peek-buffered-line))) (cond ((or (string-match re:unordered-list line) (string-match re:ordered-list line)) ; could do simple match here (item-list 0)) ((string-match re:definition-list line) (definition-list)) ((string-match re:preformatted line) (preformatted)) ((string-match re:definition-tag line) (definition-block)) ((horizontal-rule? line) => horizontal-rule) ((enscript? line) => enscript) ((nowiki? line) => nowiki) ((table? line) => table) ((examples? line) => examples) ((directive? line) => directive) ;; WARNING: If a line is not matched above but does match re:block, ;; then (paragraph) will enter an infinite loop. (else (paragraph))))) (define (directive? line) (string-match re:directive line)) (define directive (match-lambda ((_ dir args) (discard-line) (cond ((string=? dir "toc") `(toc)) ((string=? dir "tags") `(tags . ,(string-split args))))))) (define (horizontal-rule? line) (string-match re:horizontal-rule line)) (define (horizontal-rule m) (discard-line) '(hr)) (define (parse-attrs str) ; parse html attributes string, return alist (irregex-fold re:tag-attribute (lambda (i m s) (cons (cons (string->symbol (irregex-match-substring m 1)) (or (irregex-match-substring m 2) (irregex-match-substring m 3) (irregex-match-substring m 4))) s)) '() str)) (define (enscript? line) (string-match re:enscript-tag-start line)) (define enscript (match-lambda ((_ attrs ln) (discard-line) (let ((attrs (parse-attrs attrs))) (let ((lang (cond ((or (assq 'highlight attrs) (assq 'language attrs)) => (lambda (x) (string->symbol (cdr x)))) (else 'scheme)))) `(highlight ,lang ,(read-verbatim re:enscript-tag-end ln))))))) (define (nowiki? line) (string-match re:nowiki-tag-start line)) (define nowiki (match-lambda ((_ ln) (discard-line) ;; Should be interpolated into result, but we can't do that (cdr (html->sxml (read-verbatim re:nowiki-tag-end ln)))))) (define (examples? line) (string-match re:examples-tag-start line)) ;; Read block and pass through verbatim to sxml. ;; There will be some extraneous NLs. (define examples (match-lambda ((_ ln) (discard-line) ; --actually, we don't have to discard, we can just ; --allow html->sxml to read the entire thing `(examples . ,(cdr (html->sxml (read-verbatim re:examples-tag-end ln))))))) ;;; table handling (define (table? line) (string-match re:table-tag-start line)) (use sxml-transforms) (define (pre-post-order-text doc proc) (pre-post-order-splice doc `((*text* . ,(lambda (tag str) (proc str))) (*default* . ,(lambda x x))))) (define (concatenate-string-fragments L) ;; Intentional: Does not omit empty strings after collapse. (define (str-concat s) ; requires: s is pair (if (null? (cdr L)) s ; no fresh copy needed (string-concatenate-reverse s))) (define (scons s L) (if (null? s) L (cons (str-concat s) L))) (let rec ((L L) (str '())) (cond ((null? L) (scons str '())) ((pair? (car L)) (scons str (cons (rec (car L) '()) (rec (cdr L) '())))) ((string? (car L)) (rec (cdr L) (cons (car L) str))) (else (scons str (cons (car L) (rec (cdr L) '()))))))) (define table (match-lambda ((_ tag ln) (discard-line) (let* ((table-str (string-append tag (read-verbatim re:table-tag-end ln))) (table-sxml (cadr (html->sxml table-str)))) ;; Transform inline elements in strings. char entities open a ;; new string and transform does not work across string boundaries, ;; so first concatenate any adjacent string fragments in the sxml. (pre-post-order-text (concatenate-string-fragments table-sxml) inline))))) ;;; block start tag to end tag reading ;; Returns string with NL-delimited lines until end-re ;; orphaned start and end tags don't count as separate lines (define (read-verbatim end-re ln) (unless (string=? ln "") ;; special handling for orphan start tag (poke-line ln)) (string-intersperse (read-until-end-tag end-re) "\n")) (define (read-until-end-tag end-re) ; returns list of lines (let lp ((lines '())) (let ((line (read-buffered-line))) (cond ((eof-object? line) (reverse lines)) ((string-search end-re line) => (match-lambda ((_ pre post) (poke-line post) (reverse (if (string=? pre "") ; special handling for orphan end tag lines (cons pre lines)))))) ;; NOTE: Abort if we hit a new section? (else (lp (cons line lines))))))) ;;; definitions (procedures, etc.) (define (definition-block) `(def (sig . ,(definition-sigs)) . ,(definition-body))) (define de-nowikify ;; svnwiki processes HTML tags inside procedure tags. To insert a ;; literal tag requires + char entity. Here we convert ;; critical entities to regular text (without fully parsing as HTML). (let ((re (irregex '(: "" (submatch (*? any)) "")))) (define (de-entitize str) (string-translate* str '(("<" . "<") (">" . ">") (""" . "\"") ("'" . "'") ("&" . "&")))) (lambda (s) (irregex-replace/all re s (lambda (m) (de-entitize (irregex-match-substring m 1))))))) ;; (def (sig procedure "(foo bar baz)" (id foo)) ...) (define (definition-sigs) (define (definition-tag->type tag) (let ((type (string->symbol tag))) (case type ((macro) 'syntax) ;; convert old svnwiki "macro" tags to "syntax" (else type)))) (let ((line (peek-buffered-line))) (cond ((or (eof-object? line) (string=? line "")) (discard-line) '()) ((string-match re:definition-tag line) => (match-lambda ((_ tag sig) (discard-line) (let* ((type (definition-tag->type tag)) (sig (de-nowikify sig)) ; gosh i hate svnwiki sometimes (alist ((svnwiki-signature-parser) sig type))) (let ((alist (if (or (pair? alist) (null? alist)) alist `((id ,alist))))) ;; convert id to alist (cons `(,type ,sig . ,alist) (definition-sigs))))))) (else '())))) (define (definition-body) (let ((line (peek-buffered-line))) (cond ((eof-object? line) '()) ((string=? line "") (discard-line) (definition-body)) ; put in (block)? ((string-match re:header line) '()) ((string-match re:definition-tag line) '()) ((horizontal-rule? line) '()) (else (cons (block) (definition-body)))))) ;;; lists (define (item-list depth) (let ((line (peek-buffered-line))) (cond ((eof-object? line) '()) ((string=? line "") (discard-line) (item-list depth)) ((string-match re:unordered-list line) => (lambda (m) (let ((next-depth (string-length (cadr m)))) (cond ((> next-depth depth) `(ul . ,(item-list-items next-depth))) (else '()))))) ((string-match re:ordered-list line) => (lambda (m) (let ((next-depth (string-length (cadr m)))) (cond ((> next-depth depth) `(ol . ,(item-list-items next-depth))) (else '()))))) (else '())))) (define (item-list-items depth) (let ((line (peek-buffered-line))) (cond ((eof-object? line) '()) ((string=? line "") (discard-line) (item-list-items depth)) ((or (string-match re:unordered-list line) (string-match re:ordered-list line)) => (lambda (m) (let ((next-depth (string-length (cadr m))) (item (caddr m))) (cond ((> next-depth depth) (item-list next-depth)) ((= next-depth depth) (discard-line) (let ((item (item-list-item (list item)))) (let* ((next-list (item-list next-depth)) (next-list (if (pair? next-list) ; hack for proper nesting (list next-list) '()))) ;; ITEM is inline not block and should be interpolated. `((li ,@(inline item) . ,next-list) . ,(item-list-items depth))))) (else '()))))) (else '())))) ;; List items may extend across lines; read the lines until reaching a block item ;; and coalesce them. Special case: initial whitespace does not trigger a PRE block. (define (item-list-item lines) (let ((line (peek-buffered-line))) (cond ((or (eof-object? line) (string=? line "")) (discard-line) (string-intersperse (reverse lines) " ")) (else (let ((trimmed-line (string-trim-both line))) (cond ((string-match re:block trimmed-line) (string-intersperse (reverse lines) " ")) (else (discard-line) (item-list-item (cons trimmed-line lines))))))))) (define (definition-list) `(dl . ,(definition-list-items))) (define (definition-list-items) (define (break-pivot pred? then lis) ;; break list at pred? not including pivot point; (let loop ((L lis) (left '())) ;; THEN processes pivot and reassembles to (L . R) (cond ((null? L) (cons lis '())) ((pred? (car L)) => (lambda (v) (then v left (car L) (cdr L)))) (else (loop (cdr L) (cons (car L) left)))))) (define (unsplit i L p R) (define (split-char str i) ; split string at index i into cons (before . after) (cons (substring str 0 i) (substring str (+ i 1)))) (match-let (((x . y) (split-char p i))) (let ((x (string-trim-right x)) (y (string-trim y))) (cons (reverse (if (string=? x "") L (cons x L))) (if (string=? y "") R (cons y R)))))) (define (at-colon x) (and (string? x) (string-index x #\:))) (let loop ((items '())) (let ((line (peek-buffered-line))) (cond ((eof-object? line) (reverse items)) ((string=? line "") (discard-line) (loop items)) ((string-match re:definition-list line) => (match-lambda ((_ ln) (discard-line) ;; Colons inside inline markup (links, code) must be ignored. ;; We mark up the entire line, then split the list in two at ;; the first "naked" colon. (match-let (((term . def) (break-pivot at-colon unsplit (inline ln)))) (loop (if (null? def) `( (dt . ,term) . ,items) `((dd . ,def) (dt . ,term) . ,items))))))) (else (reverse items)))))) ;;; pre (define (preformatted) `(pre ,(slurp-preformatted))) (define (slurp-preformatted) (let loop ((lines '())) (let ((line (peek-buffered-line))) (cond ((or (eof-object? line) (string=? line "")) (discard-line) (string-intersperse (reverse lines) "\n")) ((string-match re:preformatted line) => (match-lambda ((_ text) (discard-line) (loop (cons text lines))))) (else (string-intersperse (reverse lines) "\n")))))) ;;; para (define (paragraph) `(p . ,(inline (slurp-paragraph)))) (define (slurp-paragraph) (let loop ((lines '())) (let ((line (peek-buffered-line))) (cond ((or (eof-object? line) (string=? line "")) (discard-line) (string-intersperse (reverse lines) " ")) ((string-match re:block line) (string-intersperse (reverse lines) " ")) (else (discard-line) (loop (cons (string-trim-both line) lines))))))) ;;; svnwiki->sxml (define (svnwiki->sxml in) (discard-line) ; clear buffer (with-input-from-port in (lambda () (section-body 1)))) #| document :: section-body section :: section-header section-body section-header :: ==+ Title section-body :: block* section>* |# ;;; Felix's wiki2html inline parser modified for sxml output (require-library srfi-1 data-structures) (import (only srfi-1 first second third find)) (import (only data-structures string-translate*)) ;; inline elements ;; (define +code+ '(: "{{" (submatch (* (? #\}) (+ (~ #\})))) "}}")) ;; sjamaan suggestion (define +code+ '(: "{{" (submatch (~ #\}) (*? any)) "}}")) (define +bold+ '(: "'''" (submatch (~ #\') (*? any)) "'''")) (define +italic+ '(: "''" (submatch (~ #\') (*? any)) "''")) (define +html-tag+ '(: #\< (submatch (+ (~ #\>))) #\>)) (define +nowiki+ '(: "" (submatch (*? any)) "")) (define +link+ '(: #\[ #\[ (submatch (* (~ #\] #\|))) (? #\| (submatch (* (~ #\])))) #\] #\])) (define +image-link+ '(: #\[ #\[ (* space) "image:" (* space) (submatch (* (~ #\] #\|))) (? #\| (submatch (* (~ #\])))) #\] #\])) (define +inline-element+ `(or ,+code+ ,+image-link+ ,+link+ ;; ,+html-tag+ ,+nowiki+ ,+bold+ ,+italic+)) (define +ext-url+ '(: (* space) (/ "AZaz") (* (or (/ "AZaz09") ("+.-"))) ;; scheme "://" (* any))) (define *manual-pages* '()) (define-constant rx irregex) ;; Parse nowiki contents as html; do no ;; further parsing for inline elements. (define (nowiki-inline str) (cdr (html->sxml str))) (define (inline/collapse str) ;; INLINE always returns a list, even for a single string. ;; INLINE/COLLAPSE checks for the single string case and lifts it out of the list. ;; This saves a little space in the SXML output when single strings are common, ;; such as in link descriptions. (let ((L (inline str))) (if (and (pair? L) (null? (cdr L)) (string? (car L))) (car L) L))) (define inline (let ((rx:inline-element (rx +inline-element+)) (rx:code (rx `(: bos ,+code+))) (rx:html-tag (rx `(: ,+html-tag+))) (rx:image-link (rx `(: bos ,+image-link+))) (rx:link (rx `(: bos ,+link+))) (rx:ext-url (rx +ext-url+)) (rx:bold (rx `(: bos ,+bold+))) (rx:italic (rx `(: bos ,+italic+))) (rx:nowiki (rx `(: bos ,+nowiki+)))) (lambda (str) (let ((m (string-search-positions rx:inline-element str))) (if (not m) (if (string=? str "") '() (list str)) (let ((before (substring str 0 (caar m))) (after (let ((rest (substring str (caar m)))) (define (continue m) (inline (substring rest (string-length (first m))))) (cond ((string-search rx:code rest) => (lambda (m) (cons `(tt ,(second m)) (continue m)))) ((string-search rx:image-link rest) => (lambda (m) (cons `(image-link ,(second m)) (continue m)))) ((string-search rx:link rest) => (lambda (m) (let ((href (string-trim-both (second m))) ; ? (desc (third m))) (cons (cond ((string-match rx:ext-url href) (if desc `(link ,href ,(inline/collapse desc)) `(link ,href))) ;; Note: internal links ("int-link") make no ;; sense outside of the wiki unless they refer directly ;; to an identifier or egg name. ;; Note: Ideally handle #xxx links as node pointers. ;; But hard to ensure a node is meant. (else (if desc `(int-link ,href ,(inline/collapse desc)) `(int-link ,href)))) (continue m))))) ((string-search rx:bold rest) => (lambda (m) (cons `(b . ,(inline (second m))) (continue m)))) ((string-search rx:italic rest) => (lambda (m) (cons `(i . ,(inline (second m))) (continue m)))) ((string-search rx:nowiki rest) => (lambda (m) (append (nowiki-inline (second m)) (continue m)))) (else (error "unknown inline match" m rest)))))) (if (string=? before "") after (cons before after))) ) )))) ;;; signature parsing ;; Function of two args, SIG and TYPE, called for each definition. Should return either a pair ;; (an alist) or an identifier. This result is inserted into the ;; defsig clause. The only current defined alist key is "id", whose ;; value is the identifier for this signature, or #f on parse failure. ;; Non-pair returns (identifiers) are changed into the alist `((id ,id)). ;; A null alist indicates no processing was done at svnwiki parse time, ;; effectively indicating the user should parse it if desired. (define svnwiki-signature-parser (make-parameter (constantly '()))) ;; Convert signature string (usually a list or bare identifier) into an identifier. ;; Returns a symbol, a string, or #f. ;; We read the string with the scheme reader. If a list, take the car recursively. ;; If a symbol, return the symbol. If any other scheme object, return the whole ;; signature string. If it cannot be read as a scheme expression, return #f. ;; As a special case, read syntax returns the entire signature without trying ;; to read it first; note this may cause a problem with older Chicken versions ;; due to r/w variance with symbols starting with #. ;; Also note that results of parsing a keyword identifier are undefined. (define svnwiki-signature->identifier (let ((+rx:ivanism+ (irregex '(: ":" eos)))) (lambda (sig type) (if (eq? type 'read) sig (condition-case (let loop ((id (with-input-from-string sig read)) (in-pair #f)) (cond ((pair? id) (loop (car id) #t)) ((keyword? id) (if in-pair id ;; Returned as symbol, but ->string will drop any colons. ;; SPECIAL HANDLING: handle e.g. MPI:init:: -> MPI:init, ;; only at toplevel of signature (not nested in a pair). ;; Remove this once these signatures are normalized. ;; (Warning: when read as a keyword, symbol->string ;; will strip one : itself). We assume keywords are ;; in suffix style. (let ((str (irregex-replace +rx:ivanism+ (symbol->string id) ""))) (if str (string->symbol str) id)))) ((symbol? id) id) (else sig))) ((exn) #f)))))) )