;; ;; Utility procedures for manipulating blobs as byte sequences. ;; ;; Copyright 2009-2018 Ivan Raikov. ;; ;; This program is free software: you can redistribute it and/or ;; modify it under the terms of the GNU General Public License as ;; published by the Free Software Foundation, either version 3 of ;; the License, or (at your option) any later version. ;; ;; This program is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; ;; A full copy of the GPL license can be found at ;; . (module byte-blob (byte-blob? byte-blob-empty? byte-blob-length byte-blob-empty blob->byte-blob list->byte-blob string->byte-blob file->byte-blob byte-blob-replicate byte-blob->blob byte-blob-offset byte-blob-cons byte-blob-car byte-blob-cdr byte-blob-ref byte-blob-uref byte-blob-set! byte-blob-uset! byte-blob-append byte-blob-reverse byte-blob-intersperse byte-blob-take byte-blob-drop byte-blob-span byte-blob-map byte-blob-fold-left byte-blob-fold-right byte-blob-find byte-blob->list byte-blob->string byte-blob-read byte-blob-write u8vector->byte-blob s8vector->byte-blob u16vector->byte-blob s16vector->byte-blob u32vector->byte-blob s32vector->byte-blob f32vector->byte-blob f64vector->byte-blob byte-blob->u8vector byte-blob->s8vector byte-blob->u16vector byte-blob->s16vector byte-blob->u32vector byte-blob->s32vector byte-blob->f32vector byte-blob->f64vector ) (import scheme (chicken base) (chicken foreign) (chicken blob) (chicken file posix) (chicken memory) srfi-1) (define-record-type byte-blob (make-byte-blob object offset length ) byte-blob? (object byte-blob-object ) (offset byte-blob-offset ) (length byte-blob-length ) ) (define byte-blob->blob byte-blob-object) (define (blob->byte-blob b) (and (blob? b) (make-byte-blob b 0 (blob-size b)))) (define (byte-blob-empty) (make-byte-blob (make-blob 0) 0 0)) (define (byte-blob-empty? b) (zero? (byte-blob-length b))) (define (byte-blob-copy b #!optional (offset (byte-blob-offset b)) (length (byte-blob-length b))) (assert (and (or (positive? offset) (zero? offset)) (or (positive? length) (zero? length)) (>= (- (blob-size (byte-blob-object b)) offset) length))) (make-byte-blob (byte-blob-object b) offset length )) (define blob-set! (foreign-lambda* void ((nonnull-blob b) (integer offset) (byte value)) #<byte-blob lst) (let* ((len (length lst)) (ob (make-blob len))) (let loop ((lst lst) (i 0)) (if (null? lst) (make-byte-blob ob 0 len) (begin (blob-set! ob i (car lst)) (loop (cdr lst) (+ i 1))))))) (define (string->byte-blob str) (make-byte-blob (string->blob str) 0 (string-length str))) (define blob-fill (foreign-lambda* void ((nonnull-blob b) (unsigned-int n) (integer offset) (byte value)) #<=0; i++,p--) { b1[p] = b[i]; } C_return (C_SCHEME_UNDEFINED); END )) (define (byte-blob-reverse b) (let* ((blen (byte-blob-length b)) (ob (byte-blob-object b)) (ob1 (make-blob blen))) (blob-reverse ob ob1 (byte-blob-offset b) blen) (make-byte-blob ob1 0 blen))) (define blob-intersperse (foreign-lambda* void ((nonnull-blob b) (nonnull-blob b1) (byte sep) (integer offset) (integer size)) #<0; i++,p+=2,n--) { b1[p] = sep; b1[p+1] = b[i]; } C_return (C_SCHEME_UNDEFINED); END )) (define (byte-blob-intersperse b x) (let ((blen (byte-blob-length b))) (if (<= blen 1) b (let* ((ob (byte-blob-object b)) (b1len (- (* 2 blen) 1)) (ob1 (make-blob b1len))) (blob-intersperse ob ob1 x (byte-blob-offset b) blen ) (make-byte-blob ob1 0 b1len))))) (define (byte-blob-take b n) (assert (positive? n)) (let ((blen (byte-blob-length b))) (if (< blen n) b (let* ((ob (byte-blob-object b)) (ob1 (make-blob n))) (move-memory! ob ob1 n (byte-blob-offset b) 0) (make-byte-blob ob1 0 n))))) (define (byte-blob-drop b n) (if (zero? n) b (let ((blen (byte-blob-length b))) (assert (and (positive? n) (<= n blen))) (byte-blob-copy b (+ n (byte-blob-offset b)) (- blen n))))) (define (byte-blob-span b start end) (assert (and (or (zero? start) (positive? start)) (positive? end) (< start end))) (byte-blob-take (byte-blob-drop b start) (- end start))) (define (byte-blob-map f b) (let* ((blen (byte-blob-length b)) (ob (byte-blob-object b)) (ob1 (make-blob blen))) (let loop ((i blen) (p (+ blen (byte-blob-offset b)))) (if (positive? i) (let ((p (- p 1))) (blob-set! ob1 p (f (blob-ref ob p))) (loop (- i 1) p)) (make-byte-blob ob1 0 blen))))) (define (byte-blob-fold-right f init b) (let* ((blen (byte-blob-length b)) (ob (byte-blob-object b))) (let loop ((i blen) (p (+ blen (byte-blob-offset b))) (ax init)) (if (positive? i) (let ((p (- p 1))) (loop (- i 1) p (f (blob-ref ob p) ax))) ax)))) (define (byte-blob-fold-left f init b) (let* ((blen (byte-blob-length b)) (ob (byte-blob-object b))) (let loop ((i blen) (p (byte-blob-offset b)) (ax init)) (if (positive? i) (loop (- i 1) (+ 1 p) (f (blob-ref ob p) ax)) ax)))) (define (byte-blob->list b . rest) (let-optionals rest ((fmap identity)) (let loop ((b b) (ax '())) (cond ((byte-blob-empty? b) (reverse ax)) (else (loop (byte-blob-cdr b) (cons (fmap (byte-blob-car b)) ax))))))) (define (byte-blob->string b) (blob->string (byte-blob-object b))) (define (byte-blob->string b) (assert (byte-blob? b)) (let* ([n (byte-blob-length b)] [s (make-string n)] ) (move-memory! (byte-blob-object b) s n (byte-blob-offset b) 0) s)) ;; The following three functions are borrowed from the ;; Chicken-specific parts of SWIG #> static void chicken_Panic (C_char *) C_noret; static void chicken_Panic (C_char *msg) { C_word *a = C_alloc (C_SIZEOF_STRING (strlen (msg))); C_word scmmsg = C_string2 (&a, msg); C_halt (scmmsg); exit (5); /* should never get here */ } static void chicken_ThrowException(C_word value) C_noret; static void chicken_ThrowException(C_word value) { char *aborthook = C_text("\003sysabort"); C_word *a = C_alloc(C_SIZEOF_STRING(strlen(aborthook))); C_word abort = C_intern2(&a, aborthook); abort = C_block_item(abort, 0); if (C_immediatep(abort)) chicken_Panic(C_text("`##sys#abort' is not defined")); #if defined(C_BINARY_VERSION) && (C_BINARY_VERSION >= 8) C_word rval[3] = { abort, C_SCHEME_UNDEFINED, value }; C_do_apply(3, rval); #else C_save(value); C_do_apply(1, abort, C_SCHEME_UNDEFINED); #endif } void chicken_io_exception (int code, int msglen, const char *msg) { C_word *a; C_word scmmsg; C_word list; a = C_alloc (C_SIZEOF_STRING (msglen) + C_SIZEOF_LIST(2)); scmmsg = C_string2 (&a, (char *) msg); list = C_list(&a, 2, C_fix(code), scmmsg); chicken_ThrowException(list); } <# (define blob-read (foreign-lambda* int ((integer fd) (nonnull-blob b) (integer n) ) #<fileno port) ob n))) (if (positive? s) (make-byte-blob ob 0 s) #!eof)))) (define (file->byte-blob filename #!optional mode) (let ((filesize (file-size filename))) (if mode (call-with-input-file filename (lambda (port) (byte-blob-read port filesize)) mode) (call-with-input-file filename (lambda (port) (byte-blob-read port filesize))) )) ) (define blob-write (foreign-lambda* void ((integer fd) (nonnull-blob b) (integer size) (integer offset)) #<fileno port) ob n offset))) ;; code borrowed from srfi-4.scm: (define (pack-copy tag loc) (lambda (v) (##sys#check-structure v tag loc) (let* ((old (##sys#slot v 1)) (n (##sys#size old)) (new (##sys#make-blob n))) (move-memory! old new) (make-byte-blob new 0 n) ))) (define u8vector->byte-blob (pack-copy 'u8vector 'u8vector->byte-blob)) (define s8vector->byte-blob (pack-copy 's8vector 's8vector->byte-blob)) (define u16vector->byte-blob (pack-copy 'u16vector 'u16vector->byte-blob)) (define s16vector->byte-blob (pack-copy 's16vector 's16vector->byte-blob)) (define u32vector->byte-blob (pack-copy 'u32vector 'u32vector->byte-blob)) (define s32vector->byte-blob (pack-copy 's32vector 's32vector->byte-blob)) (define f32vector->byte-blob (pack-copy 'f32vector 'f32vector->byte-blob)) (define f64vector->byte-blob (pack-copy 'f64vector 'f64vector->byte-blob)) (define (unpack-copy tag sz loc) (lambda (bb) (let ((str (byte-blob-object bb)) (offset (byte-blob-offset bb))) (##sys#check-byte-vector str loc) (let* ((len (byte-blob-length bb)) (new (##sys#make-blob len))) (if (or (eq? #t sz) (eq? 0 (##core#inline "C_fixnum_modulo" len sz))) (begin (move-memory! str new len offset) (##sys#make-structure tag new)) (##sys#error loc "blob does not have correct size for packing" tag len sz) ) ) ) )) (define byte-blob->u8vector (unpack-copy 'u8vector #t 'byte-blob->u8vector)) (define byte-blob->s8vector (unpack-copy 's8vector #t 'byte-blob->s8vector)) (define byte-blob->u16vector (unpack-copy 'u16vector 2 'byte-blob->u16vector)) (define byte-blob->s16vector (unpack-copy 's16vector 2 'byte-blob->s16vector)) (define byte-blob->u32vector (unpack-copy 'u32vector 4 'byte-blob->u32vector)) (define byte-blob->s32vector (unpack-copy 's32vector 4 'byte-blob->s32vector)) (define byte-blob->f32vector (unpack-copy 'f32vector 4 'byte-blob->f32vector)) (define byte-blob->f64vector (unpack-copy 'f64vector 8 'byte-blob->f64vector)) ;; ;; ;; Fast sub-sequence search, based on work by Boyer, Moore, Horspool, ;; Sunday, and Lundh. ;; ;; Based on code from the Haskell text library by Tom Harper and Bryan ;; O'Sullivan. http://hackage.haskell.org/package/text ;; ;; ;; References: ;; ;; * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm. ;; Communications of the ACM, 20, 10, 762-772 (1977) ;; ;; * R. N. Horspool: Practical Fast Searching in Strings. Software - ;; Practice and Experience 10, 501-506 (1980) ;; ;; * D. M. Sunday: A Very Fast Substring Search Algorithm. ;; Communications of the ACM, 33, 8, 132-142 (1990) ;; ;; * F. Lundh: The Fast Search Algorithm. ;; (2006) ;; ;; From http://effbot.org/zone/stringlib.htm: ;; ;; When designing the new algorithm, I used the following constraints: ;; ;; * should be faster than the current brute-force algorithm for ;; all test cases (based on real-life code), including Jim ;; Hugunin’s worst-case test ;; ;; * small setup overhead; no dynamic allocation in the fast path ;; (O(m) for speed, O(1) for storage) ;; ;; * sublinear search behaviour in good cases (O(n/m)) ;; ;; * no worse than the current algorithm in worst case (O(nm)) ;; ;; * should work well for both 8-bit strings and 16-bit or 32-bit ;; Unicode strings (no O(σ) dependencies) ;; ;; * many real-life searches should be good, very few should be ;; worst case ;; ;; * reasonably simple implementation ;; ;; This rules out most standard algorithms (Knuth-Morris-Pratt is not ;; sublinear, Boyer-Moore needs tables that depend on both the ;; alphabet size and the pattern size, most Boyer-Moore variants need ;; tables that depend on the pattern size, etc.). ;; ;; After some tweaking, I came up with a simplication of Boyer-Moore, ;; incorporating ideas from Horspool and Sunday. Here’s an outline: ;; ;; def find(s, p): ;; # find first occurrence of p in s ;; n = len(s) ;; m = len(p) ;; skip = delta1(p)[p[m-1]] ;; i = 0 ;; while i <= n-m: ;; if s[i+m-1] == p[m-1]: # (boyer-moore) ;; # potential match ;; if s[i:i+m-1] == p[:m-1]: ;; return i ;; if s[i+m] not in p: ;; i = i + m + 1 # (sunday) ;; else: ;; i = i + skip # (horspool) ;; else: ;; # skip ;; if s[i+m] not in p: ;; i = i + m + 1 # (sunday) ;; else: ;; i = i + 1 ;; return -1 # not found ;; ;; The delta1(p)[p[m-1]] value is simply the Boyer-Moore delta1 (or ;; bad-character skip) value for the last character in the pattern. ;; ;; For the s[i+m] not in p test, I use a 32-bit bitmask, using the 5 ;; least significant bits of the character as the key. This could be ;; described as a simple Bloom filter. ;; ;; Note that the above Python code may access s[n], which would result in ;; an IndexError exception. For the CPython implementation, this is not ;; really a problem, since CPython adds trailing NULL entries to both ;; 8-bit and Unicode strings. ;; ;; /O(n+m)/ Find the offsets of all non-overlapping indices of ;; needle within haystack. ;; ;; In (unlikely) bad cases, this algorithm's complexity degrades ;; towards /O(n*m)/. ;; (define swizzle (foreign-lambda* unsigned-int ((unsigned-int k)) #<> (8*w)); END )) (define bitset? (foreign-lambda* bool ((blob m) (unsigned-int i)) #<> (8*w)); END )) (define (make-table nlast nindex nlen z) (lambda (i msk skp) (let loop ((i i) (msk msk) (skp skp)) (cond ((>= i nlast) (begin (setbit! msk (swizzle z)) (values msk skp))) (else (let* ((c (nindex i)) (skp1 (cond ((= c z) (- nlen i 2)) (else skp)))) (setbit! msk (swizzle c)) (loop (+ 1 i) msk skp1))) )) )) (define (scan1 hindex hlen c) (let loop ((i 0) (ax '())) (cond ((>= i hlen) (reverse ax)) ((= (hindex i) c) (loop (+ 1 i) (cons i ax))) (else (loop (+ 1 i) ax))))) (define (scan nindex hindex nlast nlen ldiff z mask skip i) (define (candidate-match i j) (cond ((>= j nlast) #t) ((not (= (hindex (+ i j)) (nindex j))) #f) (else (candidate-match i (+ 1 j))))) (let loop ((i i) (ax '())) (if (> i ldiff) (reverse ax) (let ((c (hindex (+ i nlast)))) (cond ;; ((and (= c z) (candidate-match i 0)) (loop (+ i nlen) (cons i ax))) ;; (else (let* ((next-in-pattern? (not (bitset? mask (swizzle (hindex (+ i nlen)))))) (delta (cond (next-in-pattern? (+ 1 nlen)) ((= c z) (+ 1 skip)) (else 1)))) (loop (+ i delta) ax)))))))) (define (subsequence-search needle haystack) (let ((nobj (byte-blob-object needle)) (noff (byte-blob-offset needle)) (nlen (byte-blob-length needle)) (hobj (byte-blob-object haystack)) (hoff (byte-blob-offset haystack)) (hlen (byte-blob-length haystack))) (let* ((nindex (lambda (k) (blob-ref nobj (+ noff k)))) (hindex (lambda (k) (blob-ref hobj (+ hoff k)))) (ldiff (- hlen nlen)) (nlast (- nlen 1)) (z (nindex nlast)) (tbl (make-table nlast nindex nlen z)) (m (make-blob 4)) ) (initmask m) (let-values (((mask skip) (tbl 0 m (- nlen 2)))) (cond ((= 1 nlen) (scan1 hindex hlen (nindex 0))) ((or (<= nlen 0) (negative? ldiff)) '()) (else (scan nindex hindex nlast nlen ldiff z mask skip 0))))))) ;; ;; Based on code from the Haskell text library by Tom Harper and Bryan ;; O'Sullivan. http://hackage.haskell.org/package/text ;; ;; /O(n+m)/ Find all non-overlapping instances of needle in ;; haystack. The first element of the returned pair is the prefix ;; of haystack prior to any matches of needle. The second is a ;; list of pairs. ;; ;; The first element of each pair in the list is a span from the ;; beginning of a match to the beginning of the next match, while the ;; second is a span from the beginning of the match to the end of the ;; input. ;; ;; Examples: ;; ;; > find "::" "" ;; > ==> ("", []) ;; > find "/" "a/b/c/d" ;; > ==> ("a", [("/b","/b/c/d"), ("/c","/c/d"), ("/d","/d")]) ;; ;; In (unlikely) bad cases, this function's time complexity degrades ;; towards /O(n*m)/. ;; find :: Text * Text -> (Text, [(Text, Text)]) (define (byte-blob-find needle haystack) (cond ((byte-blob-empty? needle) (error 'find "empty pattern" needle)) (else (let ((r (subsequence-search needle haystack))) (cond ((null? r) (list haystack '())) (else (let* ((hoff (byte-blob-offset haystack)) (hlen (byte-blob-length haystack)) (chunk (lambda (n l) (byte-blob-copy haystack (+ hoff n) l))) (go (lambda (s xs) (let loop ((s s) (xs xs) (ax '())) (if (null? xs) (let ((c (chunk s (- hlen s)))) (reverse (cons (list c c) ax))) (let ((x (car xs)) (xs (cdr xs))) (loop x xs (cons (list (chunk s (- x s)) (chunk s (- hlen s))) ax))))))) ) (list (chunk 0 (car r)) (go (car r) (cdr r))))))) ))) )