;; ;; mbox parser combinators specialized for strings. ;; ;; Copyright 2009-2011 Ivan Raikov and the Okinawa Institute of Science ;; and Technology. ;; ;; ;; This program is free software: you can redistribute it and/or ;; modify it under the terms of the GNU General Public License as ;; published by the Free Software Foundation, either version 3 of ;; the License, or (at your option) any later version. ;; ;; This program is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; ;; A full copy of the GPL license can be found at ;; . (module mbox-string ( mbox-file->messages message? message-envelope message-headers message-body ) (import scheme chicken foreign ) (require-extension utils data-structures srfi-1 srfi-13 typeclass input-classes) (require-library abnf internet-message mbox) (import (only abnf CharLex->CoreABNF Input->Token Token->CharLex ) (only internet-message CoreABNF->InetMessage) (only mbox Input+.CoreABNF->Mbox message? message-envelope message-headers message-body ) ) ;; ;; ;; Fast sub-sequence search, based on work by Boyer, Moore, Horspool, ;; Sunday, and Lundh. ;; ;; Based on code from the Haskell text library by Tom Harper and Bryan ;; O'Sullivan. http://hackage.haskell.org/package/text ;; ;; ;; References: ;; ;; * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm. ;; Communications of the ACM, 20, 10, 762-772 (1977) ;; ;; * R. N. Horspool: Practical Fast Searching in Strings. Software - ;; Practice and Experience 10, 501-506 (1980) ;; ;; * D. M. Sunday: A Very Fast Substring Search Algorithm. ;; Communications of the ACM, 33, 8, 132-142 (1990) ;; ;; * F. Lundh: The Fast Search Algorithm. ;; (2006) ;; ;; From http://effbot.org/zone/stringlib.htm: ;; ;; When designing the new algorithm, I used the following constraints: ;; ;; * should be faster than the current brute-force algorithm for ;; all test cases (based on real-life code), including Jim ;; Hugunin’s worst-case test ;; ;; * small setup overhead; no dynamic allocation in the fast path ;; (O(m) for speed, O(1) for storage) ;; ;; * sublinear search behaviour in good cases (O(n/m)) ;; ;; * no worse than the current algorithm in worst case (O(nm)) ;; ;; * should work well for both 8-bit strings and 16-bit or 32-bit ;; Unicode strings (no O(σ) dependencies) ;; ;; * many real-life searches should be good, very few should be ;; worst case ;; ;; * reasonably simple implementation ;; ;; This rules out most standard algorithms (Knuth-Morris-Pratt is not ;; sublinear, Boyer-Moore needs tables that depend on both the ;; alphabet size and the pattern size, most Boyer-Moore variants need ;; tables that depend on the pattern size, etc.). ;; ;; After some tweaking, I came up with a simplication of Boyer-Moore, ;; incorporating ideas from Horspool and Sunday. Here’s an outline: ;; ;; def find(s, p): ;; # find first occurrence of p in s ;; n = len(s) ;; m = len(p) ;; skip = delta1(p)[p[m-1]] ;; i = 0 ;; while i <= n-m: ;; if s[i+m-1] == p[m-1]: # (boyer-moore) ;; # potential match ;; if s[i:i+m-1] == p[:m-1]: ;; return i ;; if s[i+m] not in p: ;; i = i + m + 1 # (sunday) ;; else: ;; i = i + skip # (horspool) ;; else: ;; # skip ;; if s[i+m] not in p: ;; i = i + m + 1 # (sunday) ;; else: ;; i = i + 1 ;; return -1 # not found ;; ;; The delta1(p)[p[m-1]] value is simply the Boyer-Moore delta1 (or ;; bad-character skip) value for the last character in the pattern. ;; ;; For the s[i+m] not in p test, I use a 32-bit bitmask, using the 5 ;; least significant bits of the character as the key. This could be ;; described as a simple Bloom filter. ;; ;; Note that the above Python code may access s[n], which would result in ;; an IndexError exception. For the CPython implementation, this is not ;; really a problem, since CPython adds trailing NULL entries to both ;; 8-bit and Unicode strings. ;; ;; /O(n+m)/ Find the offsets of all non-overlapping indices of ;; needle within haystack. ;; ;; In (unlikely) bad cases, this algorithm's complexity degrades ;; towards /O(n*m)/. ;; (define swizzle (foreign-lambda* int ((char k)) #<= i nlast) (values (bitwise-ior msk (swizzle z)) skp)) (else (let* ((c (nindex i)) (skp1 (cond ((char=? c z) (- nlen i 2)) (else skp)))) (loop (+ 1 i) (bitwise-ior msk (swizzle c)) skp1))))))) (define (scan1 hindex hlen c) (let loop ((i 0) (ax '())) (cond ((>= i hlen) (reverse ax)) ((char=? (hindex i) c) (loop (+ 1 i) (cons i ax))) (else (loop (+ 1 i) ax))))) (define (scan nindex hindex nlast nlen ldiff z mask skip i) (define (candidate-match i j) (cond ((>= j nlast) #t) ((not (char=? (hindex (+ i j)) (nindex j))) #f) (else (candidate-match i (+ 1 j))))) (let loop ((i i) (ax '())) (if (>= i ldiff) (reverse ax) (let ((c (hindex (+ i nlast)))) (cond ;; ((and (char=? c z) (candidate-match i 0)) (loop (+ i nlen) (cons i ax))) ;; (else (let* ((next-in-pattern? (zero? (bitwise-and mask (swizzle (hindex (+ i nlen)))))) (delta (cond (next-in-pattern? (+ 1 nlen)) ((char=? c z) (+ 1 skip)) (else 1)))) (loop (+ i delta) ax)))))))) (define (subsequence-search needle haystack) (let ((nobj needle) (nlen (string-length needle)) (hobj haystack) (hlen (string-length haystack))) (let* ((nindex (lambda (k) (string-ref nobj k))) (hindex (lambda (k) (string-ref hobj k))) (ldiff (- hlen nlen)) (nlast (- nlen 1)) (z (nindex nlast)) (tbl (make-table nlast nindex nlen z))) (let-values (((mask skip) (tbl 0 0 (- nlen 2)))) (cond ((= 1 nlen) (scan1 hindex hlen (nindex 0))) ((or (<= nlen 0) (negative? ldiff)) '()) (else (scan nindex hindex nlast nlen ldiff z mask skip 0))))))) ;; ;; Based on code from the Haskell text library by Tom Harper and Bryan ;; O'Sullivan. http://hackage.haskell.org/package/text ;; ;; /O(n+m)/ Find all non-overlapping instances of needle in ;; haystack. The first element of the returned pair is the prefix ;; of haystack prior to any matches of needle. The second is a ;; list of pairs. ;; ;; The first element of each pair in the list is a span from the ;; beginning of a match to the beginning of the next match, while the ;; second is a span from the beginning of the match to the end of the ;; input. ;; ;; Examples: ;; ;; > find "::" "" ;; > ==> ("", []) ;; > find "/" "a/b/c/d" ;; > ==> ("a", [("/b","/b/c/d"), ("/c","/c/d"), ("/d","/d")]) ;; ;; In (unlikely) bad cases, this function's time complexity degrades ;; towards /O(n*m)/. ;; find :: Text * Text -> (Text, [(Text, Text)]) (define (string-find needle haystack) (cond ((string-null? needle) (error 'find "empty pattern" needle)) (else (let ((r (subsequence-search needle haystack))) (cond ((null? r) (list haystack '())) (else (let* ((hlen (string-length haystack)) (chunk (lambda (n l) (string-copy haystack n l))) (go (lambda (s xs) (let loop ((s s) (xs xs) (ax '())) (if (null? xs) (let ((c (chunk s hlen))) (reverse (cons (list c c) ax))) (let ((x (car xs)) (xs (cdr xs))) (loop x xs (cons (list (chunk s x) (chunk s hlen)) ax))))))) ) (list (chunk 0 (car r)) (go (car r) (cdr r))))))) ))) (define (string-car x) (string-ref x 0)) (define (string-cdr x) (string-drop x 1)) (define string- (make- string-null? string-car string-cdr)) (define string- (Input->Token string-)) (define string- (Token->CharLex string-)) (define string- (CharLex->CoreABNF string-)) (define string- (make- string- string-find identity (lambda (x . rest) (read-all x)) )) (define string- (Input+.CoreABNF->Mbox string- string- )) (import-instance ( string-) ) )