;;>| Replacement Parser ;;> ;;> Edward [parser combinators][edward parse] for parsing replacement strings. ;;> ;;> [edward parse]: edward.parse.html (define parse-backref (parse-map (parse-seq (parse-char #\\) parse-digits) (lambda (lst) (cons 'backref (cadr lst))))) (define parse-matched (parse-map (parse-char #\&) (lambda (ch) (cons 'matched ch)))) (define (parse-restr delim) (define replace-ctrl (char-set-adjoin (char-set #\\ #\& #\newline) delim)) (parse-map (parse-as-string (parse-repeat+ (parse-or ;; special handling for '%' as it does not neccessarily ;; need to be escaped unless it's the only character. (parse-esc (parse-char (char-set-adjoin replace-ctrl #\%))) (parse-not-char replace-ctrl)))) (lambda (str) (cons 'restr str)))) ;;> Parse a replacement string within text enclosed with the delimiter ;;> `delim`. While the combinator does not parse the enclosed character, ;;> it ensures that this `delim` character is escaped (using a `\`) ;;> within the replacement. ;;> ;;> Refer to the documentation of the [ed substitute][ed substitute] ;;> command for more information on special character support within ;;> the replacement. All of these special characters can also be ;;> escaped. ;;> ;;> [ed substitute]: https://pubs.opengroup.org/onlinepubs/9699919799/utilities/ed.html#tag_20_38_13_25 (define (parse-replace delim) (parse-map (parse-repeat (parse-atomic (parse-or parse-backref parse-matched (parse-restr delim)))) (lambda (lst) ;; If the replacement is empty replace matched text with an empty string. (if (null? lst) (cons '(restr . "") lst) lst)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;>| Replacement Procedures ;;> ;;> Procedures for performing replacements using a parsed replacement string. (define (submatch subm bv n) (if (>= n (vector-length subm)) (string->utf8 (number->string n)) ;; XXX: Handling for invalid backreferences (let ((match (vector-ref subm n))) (if match (bytevector-copy bv (car match) (cdr match)) #u8())))) (define (regex-replace* regex subst bv nth) (define (apply-replacement subm bv replacement) (fold (lambda (x y) (bytevector-append y (match x (('restr . s) (string->utf8 s)) (('matched . _) (submatch subm bv 0)) (('backref . n) (submatch subm bv n))))) #u8() replacement)) ;; TODO: Refactor this function and make it more readable. ;; Also don't rely on (values …) truncation (not in R7RS). (define (%regex-replace* re start n) (let* ((v (bytevector-copy bv start)) (subm (regex-exec regex v))) (if subm (let* ((m (vector-ref subm 0)) ;; submatch for entire regex (s (car m)) ;; start of submatch (e (cdr m)) ;; end of submatch (i (+ start e)) ;; next index in bv (r (delay (bytevector-append (bytevector-copy v 0 s) (apply-replacement subm v re))))) (values (if (eqv? n nth) (bytevector-append (force r) (bytevector-copy bv i)) (bytevector-append (if (zero? nth) (force r) (bytevector-copy v 0 e)) (%regex-replace* re i (+ n 1)))) #t)) (values v #f)))) (%regex-replace* subst 0 1)) ;;> Replace `nth` occurrence of `regex` in `str` with `subst`. If `nth` ;;> is zero, all occurrences are replaced. Returns two results: The string ;;> after performing all replacement and a boolean indicating if any ;;> replacements were successfully performed. The `regex` must be ;;> created using [make-regex][make-regex], while the replacement string ;;> `subst` must be parsed using [parse-replace][parse-replace]. ;;> ;;> [make-regex]: https://wiki.call-cc.org/eggref/5/posix-regex#make-regex ;;> [parse-replace]: #parse-replace (define (regex-replace regex subst str nth) ;; regexec(3p) offsets are byte, not character offsets. ;; Thus, the string needs to be converted to a bytevector. (let-values (((result modified) (regex-replace* regex subst (string->utf8 str) nth))) (values (utf8->string result) modified)))