; SSAX parsing of a (large) XML document and recording only
; `interesting' parts of that document.
;
; The problem was posed by Tobias Gerdin in his message to the
; SSAX-SXML mailing list on Thu, 4 Jan 2007 19:24:02 +0100:
;
;
; I have a very large document essentially containing a
; list of entries (represented as ... tags, but with
; lots of nested tags inside). Inside each entry I want to extract two
; #PCDATA sections, namely the content in a element and a
; element (in particular, I want to parse this file: ftp://
; ftp.monash.edu.au/pub/nihongo/JMdict.gz, more information the
; document is found at http://www.csse.monash.edu.au/~jwb/j_jmdict.html).
;
; I've done some experimenting but haven't found any nice solution to
; this elementary problem. Since the document is quite large (39MB) I
; don't think I'd like the seed to contain information on the two tags
; from all the entries, instead I'd thought that for each entry I'd
; pass the information (the two CDATA sections) to another procedure
; that inserts it into another data structure, and then reset the seed
; for the next entry.
;
; So, to some it up: How do I extract CDATA sections from arbitrary
; tags in a nice way?
;
;
; $Id: ssax-extraction.scm,v 1.1 2007/01/06 12:48:05 oleg Exp $
; First we define the data structure that contains the extracted data.
; It is just a list whose elements is a list of two elements
; (keb-tags gloss-tags)
; An may have several keb and gloss elements, thus plural
(define empty-final-ds '())
(define (add-final-ds-entry kebs glosses ds) (cons (list kebs glosses) ds))
; define the entities that may occur in our document
; The full list is in JMDICT DTD
; http://www.csse.monash.edu.au/~jwb/jmdict_dtd_h.html
(define entities
'((n . "noun (common) (futsuumeishi)")))
; A sample document, based roughly on
; http://www.csse.monash.edu.au/~jwb/jmdict_sample.html
; after simplifying and fixing some problems
(define doc
(string-concatenate/shared
'(""
""
"1171270"
"nekoichi1"
""
"nekoichi1"
""
""
"&n;"
"cat"
""
""
""
"1171271"
"inuichi1"
""
"inuichi1"
""
"ken"
""
""
"&n;"
"dog"
""
""
"&n;"
"Canis (lupus) familiaris"
""
""
"")))
; The parsing process uses several seeds.
; The initial seed, which is the parent seed to the `entry'
; element, is the final-ds. It accumulates the results from parsing
; entries.
; When parsing children of an , the seed is different. It is defined
; as a record below (we use records for clarity, and we use Petite
; Chez records).
; char-data is the accumulator for character data, or #f is we skip
; char data (of elements whose content we don't care about)
; kebs is the list of char data from the kebs elements so far
; glosses is the same for glosses. All the interface is functional...
(define-record entry-seed
((immutable char-data) (immutable kebs) (immutable glosses)) ())
(define new-entry-seed (make-entry-seed #f '() '()))
(define (accumulate-char-data eseed)
(make-entry-seed '() (entry-seed-kebs eseed)
(entry-seed-glosses eseed)))
; Don't accumulate char data why processing an element
(define (disregard-char-data eseed)
(make-entry-seed #f (entry-seed-kebs eseed)
(entry-seed-glosses eseed)))
(define (set-char-data cdata eseed)
(make-entry-seed cdata (entry-seed-kebs eseed)
(entry-seed-glosses eseed)))
(define (add-kebs keb eseed)
(make-entry-seed #f (cons keb (entry-seed-kebs eseed))
(entry-seed-glosses eseed)))
(define (add-glosses gloss eseed)
(make-entry-seed #f (entry-seed-kebs eseed)
(cons gloss (entry-seed-glosses eseed))))
; For clarity, the following procedure has debug output: (cout ...)
; to make it clear how the seed propagates...
; Take ssax:xml->sxml and modify it a little
; The resulting procedure takes a port and the data structure to
; store the final results in
(define custom-parser
(ssax:make-parser
NEW-LEVEL-SEED
(lambda (elem-gi attributes namespaces
expected-content seed)
(cout "entering: " elem-gi nl)
(case elem-gi
((entry) ; entering an entry...
new-entry-seed)
((keb gloss) ; switch on accum of char content
(accumulate-char-data seed))
(else seed)))
FINISH-ELEMENT
(lambda (elem-gi attributes namespaces parent-seed seed)
(cout "exiting: " elem-gi "," seed nl)
(case elem-gi
((entry)
; leaving
; parent-seed is final-ds. Add data from entry-seed
(add-final-ds-entry
(entry-seed-kebs seed)
(entry-seed-glosses seed)
parent-seed))
((keb)
(add-kebs
(ssax:reverse-collect-str-drop-ws
(entry-seed-char-data seed))
seed))
((gloss)
(add-glosses
(ssax:reverse-collect-str-drop-ws
(entry-seed-char-data seed))
seed))
(else seed)))
CHAR-DATA-HANDLER
(lambda (string1 string2 seed)
(let ((prev-char-data
(entry-seed-char-data seed)))
(if prev-char-data
(set-char-data
(if (string-null? string2) (cons string1 prev-char-data)
(cons* string2 string1 prev-char-data))
seed)
seed))) ; don't accumulate char data
DOCTYPE
(lambda (port docname systemid internal-subset? seed)
(when internal-subset?
(ssax:warn port
"Internal DTD subset is not currently handled ")
(ssax:skip-internal-dtd port))
(ssax:warn port "DOCTYPE DECL " docname " "
systemid " found and skipped")
(values #f entities '() seed))
UNDECL-ROOT
(lambda (elem-gi seed)
(values #f entities '() seed))
PI
((*DEFAULT* .
(lambda (port pi-tag seed)
; read PI's body and silently disregard it
(let ((dummy (ssax:read-pi-body-as-string port)))
seed))))
))
(cout "About to parse the document" nl)
(cout "Result" nl
(call-with-input-string doc
(lambda (port)
(custom-parser port empty-final-ds)))
nl
"Done" nl)