; SSAX parsing with limited XML doctype validation
; and datatype conversion
;
; XML Schema introduces a number of atomic XML datatypes and the
; corresponding validation rules. SXML likewise supports boolean,
; numerical, and other Scheme values [1] in the "character" content of
; SXML elements and attributes. As we parse the document, we can
; validate and de-serialize character data from the source XML
; document into the corresponding numerical etc. Scheme values. The
; present code shows an example of such datatype conversion and
; validation. This code also demonstrates element content validation:
; making sure that the sequence of child elements matches a
; user-specified template.
;
; The present code instantiates the SSAX parser framework to support a
; limited document type validation and datatype conversion. The only
; changes from ssax:xml->sxml concern procedures UNDECL-ROOT,
; NEW-LEVEL-SEED and FINISH-ELEMENT. We also add a function
; validate-content. The tests at the end of the file demonstrate the
; datatype conversion and the detection of invalid atomic and
; structural XML fragments.
;
; [1] One may use any Scheme value in the "character" content of an
; SXML element or attribute as long as this value is not a list or a
; symbol. If we need to insert a list or a symbol in the "character"
; content, we should wrap them in a vector or a procedure.
;
; $Id: validate-doctype-simple.scm,v 1.5 2006/08/30 23:23:39 oleg Exp $
; IMPORT
; The following is a Bigloo-specific module declaration.
; Other Scheme systems have something similar.
;
; See the Makefile for the rules to run this example on Bigloo, SCM
; and other systems
; (module validate-xml
; (include "myenv-bigloo.scm")
; (include "srfi-13-local.scm") ; or import from SRFI-13 if available
; (include "char-encoding.scm")
; (include "util.scm")
; (include "look-for-str.scm")
; (include "input-parse.scm")
; (include "catch-error.scm")
; (include "SSAX-code.scm")
; )
; procedure: validate-content EXPECTED-CONTENT SEED -> CONVERTED-SEED
; Verify the content represented by the SEED against the EXPECTED-CONTENT.
; If the content verifies, we optionally do a type-specific transformation:
; de-serialization. Verification failure is an error.
;
; At present, we only check for the following EXPECTED-CONTENT:
; 'bool: SEED must be a one-element list containing a
; case-insensitive string "T", "F", "Y" or "N"
; The string is converted to a boolean in the obvious way.
; 'int: SEED must be a one-element list containing a string
; convertible to an integer. We do the conversion.
; (seq tag1 tag2 ...)
; SEED must be a sequence of elements tag1 tag2 ...
;
; We accept all other content.
(define (validate-content expected-content seed)
(cout "validating seed: " seed " against the expected content "
expected-content nl)
(case expected-content
((bool)
(assert (pair? seed) (null? (cdr seed)))
(cond
((string-ci=? "T" (car seed)) (list #t))
((string-ci=? "Y" (car seed)) (list #t))
((string-ci=? "F" (car seed)) (list #f))
((string-ci=? "N" (car seed)) (list #f))
(else (error "Wrong content for the type bool: " seed))))
((int)
(assert (pair? seed) (null? (cdr seed)))
(let ((val (string->integer (car seed) 0 (string-length (car seed)))))
(or val
(error "Wrong content for the type int: " seed))
(list val)))
(else
(cond
((and (pair? expected-content)
(eq? 'seq (car expected-content)))
; validate (seq tag1 ...)
(or (equal?
(cdr expected-content)
(map (lambda (elem)
(and (pair? elem) (car elem))) seed))
(error "Wrong content for the type " expected-content
": " seed))
seed)
; accept everything else
(else seed)))))
; procedure: ssax:typedxml->sxml PORT NAMESPACE-PREFIX-ASSIG ELEMS -> SXML
;
; An instance of a SSAX parser that reads an XML document from PORT
; and returns the corresponding SXML tree. On return, PORT will point
; to the first character after the root element.
; NAMESPACE-PREFIX-ASSIG is a list of (USER-PREFIX . URI-STRING)
; that assigns USER-PREFIXes to certain namespaces identified by
; particular URI-STRINGs. The argument may be an empty list.
;
; ELEMS is either #f or a list of constraints to validate the
; input XML document against. Each item of ELEMS has the form:
; (elem-name elem-content decl-attrs)
; elem-name is an UNRES-NAME for the element.
; elem-content is an ELEM-CONTENT-MODEL.
; decl-attrs is an ATTLIST, of (ATTR-NAME . VALUE) associations
;
; If ELEMS is non-#f, each element in the input XML document must
; be described by the corresponding constraint.
; ELEM-CONTENT-MODEL specifies the expected type of the element content.
; In the present code, ELEM-CONTENT-MODEL is one of the following:
; A symbol:
; ANY - anything goes
; EMPTY-TAG - no content
; EMPTY - no content
; PCDATA - expect character data only, and no children elements
; bool - expect a character string representing a boolean
; int - expect a character string representing an integer
; A list:
; (seq tag1 tag2 ...) - expect a sequence of child elements
; tag1 tag2 ...
;
; See the function validate-content above for more details on the
; last three choices for ELEM-CONTENT-MODEL. See the description
; of the ELEM-CONTENT-MODEL datatype in SSAX.scm.
(define (ssax:typedxml->sxml port namespace-prefix-assig elem-types)
(letrec
((namespaces
(map (lambda (el)
(cons* #f (car el) (ssax:uri-string->symbol (cdr el))))
namespace-prefix-assig))
(res-name->sxml
(lambda (res-name)
(string->symbol
(string-append
(symbol->string (car res-name))
":"
(symbol->string (cdr res-name))))))
)
(let ((result
(reverse
((ssax:make-parser
NEW-LEVEL-SEED
(lambda (elem-gi attributes namespaces
expected-content seed)
(list (cons '*expected-type* expected-content)))
FINISH-ELEMENT
(lambda (elem-gi attributes namespaces parent-seed seed)
(let*
((seed-raw (ssax:reverse-collect-str-drop-ws seed))
(attrs
(attlist-fold
(lambda (attr accum)
(cons (list
(if (symbol? (car attr)) (car attr)
(res-name->sxml (car attr)))
(cdr attr)) accum))
'() attributes))
(seed
(if (and (pair? seed-raw) (pair? (car seed-raw))
(eq? '*expected-type* (caar seed-raw)))
(validate-content (cdar seed-raw) (cdr seed-raw))
seed-raw)))
(cons
(cons
(if (symbol? elem-gi) elem-gi
(res-name->sxml elem-gi))
(if (null? attrs) seed
(cons (cons '@ attrs) seed)))
parent-seed)))
CHAR-DATA-HANDLER
(lambda (string1 string2 seed)
(if (string-null? string2) (cons string1 seed)
(cons* string2 string1 seed)))
DOCTYPE
(lambda (port docname systemid internal-subset? seed)
(when internal-subset?
(ssax:warn port
"Internal DTD subset is not currently handled ")
(ssax:skip-internal-dtd port))
(ssax:warn port "DOCTYPE DECL " docname " "
systemid " found and skipped")
(values #f '() namespaces seed))
UNDECL-ROOT
(lambda (elem-gi seed)
(values elem-types '() namespaces seed))
PI
((*DEFAULT* .
(lambda (port pi-tag seed)
(cons
(list '*PI* pi-tag (ssax:read-pi-body-as-string port))
seed))))
)
port '()))))
(cons '*TOP*
(if (null? namespace-prefix-assig) result
(cons
(list '@ (cons '*NAMESPACES*
(map (lambda (ns) (list (car ns) (cdr ns)))
namespace-prefix-assig)))
result)))
)))
(define (test strs elem-types expected-result)
; This function is a standard equal? predicate with one exception.
; On Scheme systems where (string->symbol "A") and a symbol A
; are the same, equal_? is precisely equal?
; On other Scheme systems, we compare symbols disregarding their case.
; Since this function is used only in tests, we don't have to
; strive to make it efficient.
(define (equal_? e1 e2)
(if (eq? 'A (string->symbol "A")) (equal? e1 e2)
(cond
((symbol? e1)
(and (symbol? e2)
(string-ci=? (symbol->string e1) (symbol->string e2))))
((pair? e1)
(and (pair? e2)
(equal_? (car e1) (car e2)) (equal_? (cdr e1) (cdr e2))))
((vector? e1)
(and (vector? e2) (equal_? (vector->list e1) (vector->list e2))))
(else
(equal? e1 e2)))))
(let ((str (string-concatenate/shared strs)))
(newline) (display "input: ") (write str) (newline)
(let ((result
(call-with-input-string str
(lambda (port)
(ssax:typedxml->sxml port '() elem-types)))))
(display "Result: ")
(pp result)
(assert (equal_? result expected-result))
(newline))))
; This silly functions makes symbols case-sensitive
(define (symbolize exp)
(cond
((string? exp) (string->symbol exp))
((pair? exp) (cons (symbolize (car exp)) (symbolize (cdr exp))))
(else exp)))
; Simple test, no validation
(test '("
") #f
(symbolize '("*TOP*" ("BR"))))
(test
`(""
"Tn" ,nl
"")
; The doctype
(symbolize '(("FLAGS" "ANY" ()) ("FLAG" "bool" ())))
; The result
(symbolize '("*TOP*" ("FLAGS" ("FLAG" #t) ("FLAG" #f)))))
; The content of an element FLAG is of a wrong type
(assert
(failed?
(test
`(""
; we see bad content: xxx is not a boolean
"Txxx" ,nl
"")
(symbolize '(("FLAGS" "ANY" ()) ("FLAG" "bool" ())))
'())))
; Test for boolean and int datatype conversions
; Note that the resulting SXML contains a boolean #t and an
; integer number 10, rather than corresponding strings.
(test
`(""
"y"
"10" ,nl
"")
; The doctype
(symbolize '(("T" "ANY" ()) ("FLAG" "bool" ()) ("AMOUNT" "int" ())))
; The result
(symbolize '("*TOP*" ("T" ("FLAG" #t) ("AMOUNT" 10)))))
; check that the error is generated given an improper content for AMOUNT
(assert
(failed?
(test
`(""
"y"
"10xx" ,nl
"")
(symbolize '(("T" "ANY" ()) ("FLAG" "bool" ()) ("AMOUNT" "int" ())))
'())))
; Test for the proper sequence. Doctype specifies that element T must
; first contain AMOUNT and then FLAG. In the actual document, the child
; elements are switched. This is an error.
(assert
(failed?
(test
`(""
"T"
"10" ,nl
"")
(symbolize '(("T" ("seq" "AMOUNT" "FLAG") ())
("FLAG" "bool" ()) ("AMOUNT" "int" ())))
'())))
; Test for the an undeclared element: the doctype below does not
; define the FLAG element.
(assert
(failed?
(test
`(""
"T"
"10" ,nl
"")
; The doctype
(symbolize '(("T" ("seq" "AMOUNT" "FLAG") ()) ("AMOUNT" "int" ())))
'())))
; The successful test: validation of both structural and datatype
; constraints.
(test
`(""
"T"
"10" ,nl
"")
; The doctype
(symbolize '(("T" ("seq" "FLAG" "AMOUNT") ())
("FLAG" "bool" ()) ("AMOUNT" "int" ())))
; The result
(symbolize '("*TOP*" ("T" ("FLAG" #t) ("AMOUNT" 10)))))
(cout nl "All tests passed" nl)