;;
;; mbox parser combinators specialized for strings.
;;
;; Copyright 2009-2011 Ivan Raikov and the Okinawa Institute of Science
;; and Technology.
;;
;;
;; This program is free software: you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation, either version 3 of
;; the License, or (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; A full copy of the GPL license can be found at
;; .
(module mbox-string
(
mbox-file->messages
message? message-envelope message-headers message-body
)
(import scheme chicken foreign )
(require-extension utils data-structures srfi-1 srfi-13 typeclass input-classes)
(require-library abnf internet-message mbox)
(import (only abnf
CharLex->CoreABNF Input->Token
Token->CharLex
)
(only internet-message CoreABNF->InetMessage)
(only mbox Input+.CoreABNF->Mbox message? message-envelope message-headers message-body )
)
;;
;;
;; Fast sub-sequence search, based on work by Boyer, Moore, Horspool,
;; Sunday, and Lundh.
;;
;; Based on code from the Haskell text library by Tom Harper and Bryan
;; O'Sullivan. http://hackage.haskell.org/package/text
;;
;;
;; References:
;;
;; * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
;; Communications of the ACM, 20, 10, 762-772 (1977)
;;
;; * R. N. Horspool: Practical Fast Searching in Strings. Software -
;; Practice and Experience 10, 501-506 (1980)
;;
;; * D. M. Sunday: A Very Fast Substring Search Algorithm.
;; Communications of the ACM, 33, 8, 132-142 (1990)
;;
;; * F. Lundh: The Fast Search Algorithm.
;; (2006)
;;
;; From http://effbot.org/zone/stringlib.htm:
;;
;; When designing the new algorithm, I used the following constraints:
;;
;; * should be faster than the current brute-force algorithm for
;; all test cases (based on real-life code), including Jim
;; Hugunin’s worst-case test
;;
;; * small setup overhead; no dynamic allocation in the fast path
;; (O(m) for speed, O(1) for storage)
;;
;; * sublinear search behaviour in good cases (O(n/m))
;;
;; * no worse than the current algorithm in worst case (O(nm))
;;
;; * should work well for both 8-bit strings and 16-bit or 32-bit
;; Unicode strings (no O(σ) dependencies)
;;
;; * many real-life searches should be good, very few should be
;; worst case
;;
;; * reasonably simple implementation
;;
;; This rules out most standard algorithms (Knuth-Morris-Pratt is not
;; sublinear, Boyer-Moore needs tables that depend on both the
;; alphabet size and the pattern size, most Boyer-Moore variants need
;; tables that depend on the pattern size, etc.).
;;
;; After some tweaking, I came up with a simplication of Boyer-Moore,
;; incorporating ideas from Horspool and Sunday. Here’s an outline:
;;
;; def find(s, p):
;; # find first occurrence of p in s
;; n = len(s)
;; m = len(p)
;; skip = delta1(p)[p[m-1]]
;; i = 0
;; while i <= n-m:
;; if s[i+m-1] == p[m-1]: # (boyer-moore)
;; # potential match
;; if s[i:i+m-1] == p[:m-1]:
;; return i
;; if s[i+m] not in p:
;; i = i + m + 1 # (sunday)
;; else:
;; i = i + skip # (horspool)
;; else:
;; # skip
;; if s[i+m] not in p:
;; i = i + m + 1 # (sunday)
;; else:
;; i = i + 1
;; return -1 # not found
;;
;; The delta1(p)[p[m-1]] value is simply the Boyer-Moore delta1 (or
;; bad-character skip) value for the last character in the pattern.
;;
;; For the s[i+m] not in p test, I use a 32-bit bitmask, using the 5
;; least significant bits of the character as the key. This could be
;; described as a simple Bloom filter.
;;
;; Note that the above Python code may access s[n], which would result in
;; an IndexError exception. For the CPython implementation, this is not
;; really a problem, since CPython adds trailing NULL entries to both
;; 8-bit and Unicode strings.
;;
;; /O(n+m)/ Find the offsets of all non-overlapping indices of
;; needle within haystack.
;;
;; In (unlikely) bad cases, this algorithm's complexity degrades
;; towards /O(n*m)/.
;;
(define swizzle
(foreign-lambda* int ((char k))
#<= i nlast)
(values (bitwise-ior msk (swizzle z)) skp))
(else
(let* ((c (nindex i))
(skp1 (cond ((char=? c z) (- nlen i 2))
(else skp))))
(loop (+ 1 i) (bitwise-ior msk (swizzle c)) skp1)))))))
(define (scan1 hindex hlen c)
(let loop ((i 0) (ax '()))
(cond ((>= i hlen) (reverse ax))
((char=? (hindex i) c) (loop (+ 1 i) (cons i ax)))
(else (loop (+ 1 i) ax)))))
(define (scan nindex hindex nlast nlen ldiff z mask skip i)
(define (candidate-match i j)
(cond ((>= j nlast) #t)
((not (char=? (hindex (+ i j)) (nindex j))) #f)
(else (candidate-match i (+ 1 j)))))
(let loop ((i i) (ax '()))
(if (>= i ldiff) (reverse ax)
(let ((c (hindex (+ i nlast))))
(cond
;;
((and (char=? c z) (candidate-match i 0))
(loop (+ i nlen) (cons i ax)))
;;
(else
(let* ((next-in-pattern?
(zero? (bitwise-and mask (swizzle (hindex (+ i nlen))))))
(delta (cond (next-in-pattern? (+ 1 nlen))
((char=? c z) (+ 1 skip))
(else 1))))
(loop (+ i delta) ax))))))))
(define (subsequence-search needle haystack)
(let ((nobj needle)
(nlen (string-length needle))
(hobj haystack)
(hlen (string-length haystack)))
(let* ((nindex (lambda (k)
(string-ref nobj k)))
(hindex (lambda (k)
(string-ref hobj k)))
(ldiff (- hlen nlen))
(nlast (- nlen 1))
(z (nindex nlast))
(tbl (make-table nlast nindex nlen z)))
(let-values
(((mask skip) (tbl 0 0 (- nlen 2))))
(cond ((= 1 nlen)
(scan1 hindex hlen (nindex 0)))
((or (<= nlen 0) (negative? ldiff))
'())
(else
(scan nindex hindex nlast nlen ldiff z mask skip 0)))))))
;;
;; Based on code from the Haskell text library by Tom Harper and Bryan
;; O'Sullivan. http://hackage.haskell.org/package/text
;;
;; /O(n+m)/ Find all non-overlapping instances of needle in
;; haystack. The first element of the returned pair is the prefix
;; of haystack prior to any matches of needle. The second is a
;; list of pairs.
;;
;; The first element of each pair in the list is a span from the
;; beginning of a match to the beginning of the next match, while the
;; second is a span from the beginning of the match to the end of the
;; input.
;;
;; Examples:
;;
;; > find "::" ""
;; > ==> ("", [])
;; > find "/" "a/b/c/d"
;; > ==> ("a", [("/b","/b/c/d"), ("/c","/c/d"), ("/d","/d")])
;;
;; In (unlikely) bad cases, this function's time complexity degrades
;; towards /O(n*m)/.
;; find :: Text * Text -> (Text, [(Text, Text)])
(define (string-find needle haystack)
(cond ((string-null? needle)
(error 'find "empty pattern" needle))
(else
(let ((r (subsequence-search needle haystack)))
(cond ((null? r)
(list haystack '()))
(else
(let* ((hlen (string-length haystack))
(chunk (lambda (n l)
(string-copy haystack n l)))
(go (lambda (s xs)
(let loop ((s s) (xs xs) (ax '()))
(if (null? xs)
(let ((c (chunk s hlen)))
(reverse (cons (list c c) ax)))
(let ((x (car xs)) (xs (cdr xs)))
(loop x xs
(cons (list (chunk s x)
(chunk s hlen))
ax)))))))
)
(list (chunk 0 (car r))
(go (car r) (cdr r)))))))
)))
(define (string-car x) (string-ref x 0))
(define (string-cdr x) (string-drop x 1))
(define string-
(make- string-null? string-car string-cdr))
(define string-
(Input->Token string-))
(define string-
(Token->CharLex string-))
(define string-
(CharLex->CoreABNF string-))
(define string-
(make- string-
string-find
identity
(lambda (x . rest) (read-all x))
))
(define string-
(Input+.CoreABNF->Mbox string-
string-
))
(import-instance ( string-) )
)