;;;; charconv.scm -- encoding utils ;; ;; Copyright (c) 2004-2009 Alex Shinn ;; All rights reserved. ;; ;; BSD-style license: http://www.debian.org/misc/bsd.license ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This module provides a convenience layer over top of the iconv ;; module, as well as automatic detection of character encoding schemes. ;; It implicitly assumes you are using UTF8 internally for your strings ;; (you can use the 'utf8 module to change string semantics to use UTF8 ;; as well). Given that, all you need to do is specify the external ;; encoding you are working with. ;; ;; INPUT/OUTPUT PROCEDURES: ;; ;; The following are direct analogs of the equivalent R5RS procedures: ;; ;; - open-encoded-input-file FILE ENC ;; - call-with-encoded-input-file FILE ENC PROC ;; - with-input-from-encoded-file FILE ENC THUNK ;; - open-encoded-output-file FILE ENC ;; - call-with-encoded-output-file FILE ENC PROC ;; - with-output-to-encoded-file FILE ENC THUNK ;; ;; Example: ;; ;; (with-input-from-encoded-file "/usr/share/edict/edict" "EUC-JP" ;; read-line) ;; ;; - read-encoded-string ENC [N [PORT]] ;; ;; An anolog of read-string using byte-count (not character count). ;; May read additional bytes to ensure you read along a character ;; boundary. If you really want exactly N bytes regardless of ;; character boundaries, you should combine read-string with ;; ces-convert below. ;; ;; UTILITY PROCEDURES: ;; ;; The following are copied from the Gauche API. CES stands for ;; Character Encoding Scheme. ;; ;; - ces-equivalent? CES-A CES-B ;; ;; Returns #t if CES-A and CES-B are equivalent (aliases), #f otherwise. ;; ;; - ces-upper-compatible? CES-A CES-B ;; ;; Returns #t if a string encoded in CES-B can be considered a string ;; in CES-A without conversion. ;; ;; - ces-convert STR FROM [TO] ;; ;; Return a new string of STR converted from encoding FROM to encoding ;; TO. ;; ;; DETECTION PROCEDURES: ;; ;; - detect-file-encoding FILE [LOCALE] ;; - detect-encoding STRING [LOCALE] ;; ;; The detection procedures can correctly identify most common 'types' ;; of encodings, such as UTF-8/16/32, EUC-*, ISO-2022-*, Shift_JIS or ;; single-byte, without any need for specifying the locale. However, ;; currently it doesn't include any statistical or linguistic routines, ;; without which it can't distinguish between EUC-JP and EUC-KR, or ;; between any of the single-byte encodings (including ISO-8859-*). In ;; these cases you can specify a locale, such that in the event of a ;; single-byte encoding a "de" locale would result in the default ;; German single-byte encoding, ISO-8859-1. ;; ;; The detect-file-encoding procedure also recognizes the Emacs-style ;; ;; -*- coding: foo -*- ;; ;; signature in either of the first two lines. ;; ;; ;; AUTOMATIC DETECTION: ;; ;; You can also use the automatic detection implicitly in the input ;; procedures by specifying an encoding of "*" or "*". For ;; example, ;; ;; (open-encoded-input-file file "*") ; guess with no locale ;; (open-encoded-input-file file "*DE") ; guess with a German locale ;; ;; For compatibility with the Gauche convention, the encoding "*JP" ;; is equivalent to "*JA", the Japanese locale. ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (require-library regex iconv) (module charconv ( make-encoded-input-port make-encoded-output-port open-encoded-input-file open-encoded-output-file with-input-from-encoded-file with-output-to-encoded-file call-with-encoded-input-file call-with-encoded-output-file detect-encoding detect-file-encoding ces-equivalent? ces-upper-compatible? ces-convert #;ces-converted-length read-encoded-string ) (import scheme chicken extras regex ports posix srfi-69 iconv) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; high-level interface (define (open-encoded-input-file file enc) (if (and enc (eqv? (string-ref enc 0) #\*)) (let* ((lang1 (and (>= (string-length enc) 3) (string (char-downcase (string-ref enc 1)) (char-downcase (string-ref enc 2))))) (lang (if (equal? lang1 "jp") "ja" lang1))) (make-encoded-input-port (open-input-file file) (or (detect-file-encoding file lang) "UTF8"))) (make-encoded-input-port (open-input-file file) enc))) (define (with-input-from-encoded-file file enc thunk) (let* ((in (open-encoded-input-file file enc)) (res (with-input-from-port in thunk))) (close-input-port in) res)) (define (call-with-encoded-input-file file enc proc) (let* ((in (open-encoded-input-file file enc)) (res (proc in))) (close-input-port in) res)) (define (open-encoded-output-file file enc) (make-encoded-output-port (open-output-file file) enc)) (define (with-output-to-encoded-file file enc thunk) (call-with-output-file file (lambda (out) (with-output-to-port (make-encoded-output-port out enc) thunk)))) (define (call-with-encoded-output-file file enc proc) (call-with-output-file file (lambda (out) (proc (make-encoded-output-port out enc))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; low-level encoding ports ;; Padding ;; We read a block of text at a time and feed it to iconv, but for ;; variable-byte encodings such as sjis and euc the block isn't ;; guaranteed to be on a character boundary, so we check and read ;; extra characters until we're sure we're on a valid boundary. This ;; is potentially very slow in pathological cases (in practice we'll ;; usually hit a newline if nothing else) so it should be replaced by ;; internal state buffering in the iconv routines. (define (pad-sjis-input str in) (define (starter? ch) (or (<= #x80 (char->integer ch) #x9F) (<= #xE0 (char->integer ch) #xFC))) (let ((len (string-length str))) (if (or (zero? len) (not (starter? (string-ref str (- len 1))))) str (let ((rest (let lp ((acc '())) (let ((ch (read-char in))) (cond ((eof-object? ch) acc) ((not (starter? ch)) (cons ch acc)) (else (lp (cons ch acc)))))))) (if (pair? rest) (string-append str (list->string (reverse))) str))))) (define (pad-euc-input str in) (define (multi? ch) (<= #xA0 (char->integer ch) #xFE)) (let ((len (string-length str))) (if (or (zero? len) (<= (char->integer (string-ref str (- len 1))) #x7F)) str (let ((rest (let lp ((acc '())) (let ((ch (read-char in))) (cond ((eof-object? ch) acc) ((<= #x8E (char->integer ch) #x8F) (let* ((ch2 (read-char in)) (ch3 (read-char in))) (if (eof-object? ch3) (cons ch acc) (cons ch3 (cons ch2 (cons ch acc)))))) ((<= (char->integer ch) #x7F) (cons ch acc)) (else (lp (cons ch acc)))))))) (if (pair? rest) (string-append str (list->string (reverse rest))) str))))) (define (encoded-input-port-padder enc) (cond ((string-ci=? enc "SHIFT_JIS") pad-sjis-input) ((string-ci=? enc "EUC-JP") pad-euc-input) (else #f))) (define (make-encoded-input-port in enc) (if (or (not enc) (string-ci=? enc "UTF8") (string-ci=? enc "ASCII")) in (let ((cd (iconv-open "UTF8" enc)) (buf "") (pad (or (encoded-input-port-padder enc) (lambda (str in) str))) (size 0) (off 0)) (if (not cd) (error "unknown encoding" enc) (make-input-port (lambda () (if (>= off size) (let ((str (pad (read-string 1024 in) in))) (set! buf (iconv cd str)) (set! size (string-length buf)) (set! off 0))) (if (>= off size) #!eof (let ((ch (string-ref buf off))) (set! off (+ off 1)) ch))) (lambda () (or (< off size) (char-ready? in))) (lambda () (close-input-port in))))))) (define (make-encoded-output-port out enc) (if (or (not enc) (string-ci=? enc "UTF8")) out (let ((cd (iconv-open enc "UTF8"))) (if (not cd) (error "unknown encoding" enc) (make-output-port (lambda (str) (display (iconv cd str) out)) (lambda () (close-output-port out)) (lambda () (flush-output out))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; utilities (define (string-upcase str) (let* ((len (string-length str)) (str2 (make-string len))) (do ((i 0 (+ i 1))) ((= i len) str2) (string-set! str2 i (char-upcase (string-ref str i)))))) ;; this should match iconv (or whatever the backend converter is) as ;; closely as possible (define ces-normalize-name (let ((aliases (make-hash-table string=?))) (for-each (lambda (ls) (for-each (cute hash-table-set! aliases <> (car ls)) (cdr ls))) '(("UTF8" "UTF-8") ("ISO-8859-1" "LATIN-1") ("SHIFT_JIS" "SJIS" "SHIFTJIS" "SHIFT-JIS") ("EUC-JP" "EUCJP" "EUC_JP") )) (lambda (str) (let ((str2 (string-upcase str))) (hash-table-ref/default aliases str2 str2))))) (define (%ces-upper-compatible? a b) (cond ((not b) #t) ((string=? b "UTF8") (string=? a "UTF8")) ((string=? b "ASCII") (not (member a '("UTF16" "UTF32")))) (else #f))) (define (ces-upper-compatible? a b) (%ces-upper-compatible? (ces-normalize-name a) (ces-normalize-name b))) (define (ces-equivalent? a b) (string=? (ces-normalize-name a) (ces-normalize-name b))) (define (ces-convert str from . o) (let ((to (or (and (pair? o) (car o)) "UTF8"))) (if (ces-upper-compatible? to from) str (let ((cd (iconv-open to from))) (if (not cd) (error "ces-convert: unsupported conversion" to from) (iconv cd str)))))) (define (read-encoded-string enc1 . o) (let-optionals* o ((n #f) (port (current-input-port))) (let ((enc (ces-normalize-name enc1))) (if n (let* ((padder (encoded-input-port-padder enc)) (str1 (read-string n port)) (str (if padder (padder str1 port) str1))) (if enc (ces-convert str enc "UTF8") str)) (ces-convert (read-string #f port) enc1 "UTF8"))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; automatic encoding detection ;; When should we use ISO-8859-15? (define *8-bit-encodings* '(("af" . "ISO-8859-1") ; Afrikans ("ca" . "ISO-8859-1") ; Catalan ("da" . "ISO-8859-1") ; Danish ("de" . "ISO-8859-1") ; German ("en" . "ISO-8859-1") ; English ("eu" . "ISO-8859-1") ; Basque ("fi" . "ISO-8859-1") ; Finish ("fr" . "ISO-8859-1") ; French ("fo" . "ISO-8859-1") ; Faroese ("ga" . "ISO-8859-1") ; Irish ("gd" . "ISO-8859-1") ; Scottish ("it" . "ISO-8859-1") ; Italian ("is" . "ISO-8859-1") ; Icelandic ("nl" . "ISO-8859-1") ; Dutch ("no" . "ISO-8859-1") ; Norwegian ("pt" . "ISO-8859-1") ; Portugese ("rm" . "ISO-8859-1") ; Rhaeto-Romanic ("sq" . "ISO-8859-1") ; Albanian ("sw" . "ISO-8859-1") ; Swahili ("cs" . "ISO-8859-2") ; Czech ("hu" . "ISO-8859-2") ; Hungarian ("pl" . "ISO-8859-2") ; Polish ("ro" . "ISO-8859-2") ; Romanian ("hr" . "ISO-8859-2") ; Hungarian ("sk" . "ISO-8859-2") ; Slovak ("sl" . "ISO-8859-2") ; Slovenian ("eo" . "ISO-8859-3") ; Esperanto ("mt" . "ISO-8859-3") ; Maltese ("et" . "ISO-8859-4") ; Estonian ("lv" . "ISO-8859-4") ; Latvian ("lt" . "ISO-8859-4") ; Lithuanian ("kl" . "ISO-8859-4") ; Greenlandic ("bg" . "ISO-8859-5") ; Bulgarian ("be" . "ISO-8859-5") ; Byelorussian ("mk" . "ISO-8859-5") ; Macedonian ("ru" . "ISO-8859-5") ; Russian ("sr" . "ISO-8859-5") ; Serbian ("uk" . "ISO-8859-5") ; Ukranian ("ar" . "ISO-8859-6") ; Arabic ("fa" . "ISO-8859-6") ; Persian ("ur" . "ISO-8859-6") ; Urdu ("el" . "ISO-8859-7") ; Greek ("iw" . "ISO-8859-8") ; Hebrew ("ji" . "ISO-8859-8") ; Yiddish ("tr" . "ISO-8859-9") ; Turkish ("th" . "TIS620") ; Thai ("vi" . "VISCII") ; Vietnamese ("cy" . "ISO-8859-14") ; Welsh )) (define *euc-encodings* '(("zh" . "GB2312") ; Chinese ("ja" . "EUC-JP") ; Japanese ("kr" . "EUC-KR") ; Korean )) (define *shift-encodings* '(("ja" . "SHIFT_JIS") ; Japanese )) (define *iso-encodings* '(("ja" . "ISO-2022-JP") ; Japanese ("kr" . "ISO-2022-KR") ; Korean )) (define (detect-8-bit-encoding str) "ISO-8859-1") (define (detect-euc-encoding str) "EUC-JP") (define (detect-shift-encoding str) "SHIFT_JIS") (define (detect-iso-encoding str) "ISO-2022-JP") (define (detect-locale-encoding str enc-type lang) (define (string-downcase! s) (do ((i (- (string-length s) 1) (- i 1))) ((< i 0) s) (let ((c (char->integer (string-ref s i)))) (if (<= 65 c 90) (string-set! s i (integer->char (+ c 32))))))) (define (string-index s c) (let ((limit (string-length s))) (let lp ((i 0)) (cond ((= i limit) #f) ((eqv? (string-ref s i) c) i) (else (lp (+ i 1))))))) (define (normalize1 lang) (and lang (cond ((string-index lang #\.) => (lambda (i) (substring lang 0 i))) (else lang)))) (define (normalize2 lang) (and lang (cond ((string-index lang #\_) => (lambda (i) (substring lang 0 i))) (else lang)))) (let ((lang (and lang (string-downcase! lang)))) (case enc-type ((BINARY) #f) ((7-BIT) "ASCII") ((8-BIT) (cond ((assoc lang *8-bit-encodings*) => cdr) ((assoc (normalize1 lang) *8-bit-encodings*) => cdr) ((assoc (normalize2 lang) *8-bit-encodings*) => cdr) (else (detect-8-bit-encoding str)))) ((EUC) (cond ((assoc lang *euc-encodings*) => cdr) ((assoc (normalize1 lang) *euc-encodings*) => cdr) ((assoc (normalize2 lang) *euc-encodings*) => cdr) (else (detect-euc-encoding str)))) ((SHIFT) (cond ((assoc lang *shift-encodings*) => cdr) ((assoc (normalize1 lang) *shift-encodings*) => cdr) ((assoc (normalize2 lang) *shift-encodings*) => cdr) (else (detect-shift-encoding str)))) ((ISO-2022) (cond ((assoc lang *iso-encodings*) => cdr) ((assoc (normalize1 lang) *iso-encodings*) => cdr) ((assoc (normalize2 lang) *iso-encodings*) => cdr) (else (detect-iso-encoding str)))) ((UTF-8 UTF-16LE UTF-16BE UTF-32LE UTF-32BE) (symbol->string enc-type)) ((UTF-16 UTF-32) (string-append (symbol->string enc-type) (if (memv (machine-type) '(x86 x86-64)) "LE" "BE"))) (else (error "unknown encoding type" enc-type))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; encoding types ( == 0x1B) ;; ;; ASCII: absence of zeros or high-bit characters, no locale ;; UTF-7: can only be distinguished from ASCII statistically ;; UTF-8: absence of malformed UTF-8 (common in other encodings), no locale ;; UTF-16/32-le/be: easily determined by presence of zeros, no locale ;; ISO-2022: use of escape sequences, locale-dependent ;; -KR: stream starts w/ $)C then 0x0E ...escaped 7-bit... 0x0F ;; -JP: $B ...escaped 7-bit... (J ;; ISO-8859/TIS-620: not ASCII, no byte pattern ;; EUC-CN/KR: ASCII + [A1-FE][A1-FE] ;; EUC-JP: ASCII + 8E[A1-DF] + [A1-FE][A1-FE] + 8F[A1-FE][A1-FE], locale-dependent (CJK) ;; SJIS: ASCII + [A1-DF] + [81-9F,E0-FC][40-7E,80-FC], Japanese ;; BIG5: ASCII + [A1-FE][40-7E,A1-FE], Chinese ;; GB2312 (== EUC-CN?) : ASCII + [A0-F7][A0-FE], Chinese ;; GBK: ASCII + [81-FE][40-7E,80-FE], Chinese ;; HZ (han4zi4) : ASCII + ~{(GB|[B0-F7][A0-FE])...~} ???, Chinese ;; UHC (Unified Hangul Code): ASCII + [81-FE][41-5A,61-7A,81-FE], Korean ;; There is very little overlap in different encoding types, so rather ;; than a data-driven state-machine or statistical methods we just ;; code the states directly, which is fast and uses very little ;; memory, but doesn't help distinguish between the single-byte 8-bit ;; encodings. To address this we'll need to use statistical analysis ;; to detect between languages. Hopefully, though, UTF-8 is replacing ;; most uses of the old 8-bit encodings. (define-syntax make-detect-state-machine (lambda (expr rename compare) (apply (lambda (next limit i c . states) (define (expand-clauses ls) (let lp ((ls ls) (res '())) (if (null? ls) (reverse res) (let ((check (car ls)) (state (caddr ls))) (case (cadr ls) ((-->) (lp (cdddr ls) (cons `(,check ,state) res))) ((->) (if (and (pair? (cdddr ls)) (number? (cadddr ls))) (lp (cddddr ls) (cons `(,check (,state (,(rename '+) ,i ,(cadddr ls)))) res)) (lp (cdddr ls) (cons `(,check (,state (,(rename '+) ,i 1))) res)))) (else (error "invalid state machine: " ls))))))) (define (make-state ls) (let ((name (car ls)) (final (cadr ls))) `(,(rename 'define) (,name ,i) (,(rename 'if) (,(rename '>=) ,i ,limit) ,final (,(rename 'let) ((,c (,next i))) (,(rename 'cond) ,@(expand-clauses (cddr ls)))))))) `(,(rename 'begin) ,@(map make-state states))) (cdr expr)))) (define (detect-encoding-type str) (let ((limit (string-length str)) (maybe-iso? #f)) (define (next i) (char->integer (string-ref str i))) (define (bom-FE i) (if (and (even? i) (or (= (+ i 1) limit) (= (next (+ i 1)) #xFF))) (if (and (< (+ i 3) limit) (zero? (next (+ i 2))) (zero? (next (+ i 3)))) 'UTF-32BE 'UTF-16BE) (euc-2 (+ i 1)))) (define (bom-FF i) (if (and (even? i) (or (>= (+ i 1) limit) (= (next (+ i 1)) #xFE))) (if (and (< (+ i 3) limit) (zero? (next (+ i 2))) (zero? (next (+ i 3)))) 'UTF-32LE 'UTF-16LE) (wide-or-8bit (+ i 1)))) ;; expand the states (make-detect-state-machine next limit i c ;; syntax: ;; state final-result ;; test -> next-state [offset] ;; test --> expr (escape '7-BIT (memv c '(36 40)) --> (begin (set! maybe-iso? #t) (any (+ i 1))) else -> any) (any (if maybe-iso? 'ISO-2022 '7-BIT) (zero? c) -> null (= c #x1B) -> escape (< c #x80) -> any (< c #x8E) -> shift-2 (= c #x8E) -> euc-8E (= c #x8F) -> euc-8F (< c #xA0) -> shift-2 (< c #xC0) -> euc/shift-2 (< c #xD0) -> utf8-2-2nd (< c #xF0) -> utf8-3-2nd (< c #xFD) -> euc/shift-2 (= c #xFD) -> euc-2 (= c #xFE) -> bom-FE 0 else -> bom-FF 0) (shift-2 'SHIFT (or (<= #x40 c #x7E) (<= #x80 c #xFC)) -> shift else -> wide-or-8bit 0) (shift 'SHIFT (zero? c) -> null (or (< c #x80) (<= #xA0 c #xDF)) -> shift (<= #x80 c #xFC) -> shift-2 else -> wide-or-8bit 0) (euc-8E 'EUC (<= #xA1 c #xDF) -> euc/shift else -> shift-2 0) (euc-8F 'EUC (and (<= #xA1 c #xFE) (<= #xA1 (next (+ i 1)) #xFE)) -> euc/shift else -> shift-2 0) (euc/shift 'EUC (zero? c) -> null (< c #x80) -> euc/shift (< c #x8E) -> shift-2 (= c #x8E) -> euc-8E (= c #x8F) -> euc-8F (< c #xA0) -> shift-2 (< c #xC0) -> euc/shift-2 (< c #xFD) -> euc/shift-2 (= c #xFD) -> euc-2 else -> wide-or-8bit 0) (euc/shift-2 'EUC (<= #xA1 c #xFC) -> euc/shift (<= #xFD c #xFE) -> euc (or (<= #x40 c #x7E) (<= #x80 c #xA0)) -> shift else -> wide-or-8bit 0) (euc 'EUC (zero? c) -> null (< c #x80) -> euc (= c #x8E) -> euc-8E (= c #x8F) -> euc-8F (<= #xA0 c #xFE) -> euc-2 else -> wide-or-8bit 0) (euc-2 'EUC (<= #xA1 c #xFE) -> euc else -> wide-or-8bit 0) (utf8 'UTF-8 (zero? c) -> null (< c #x80) -> utf8 (< c #xC0) -> euc/shift 0 (< c #xE0) -> utf8-2-2nd (< c #xF0) -> utf8-3-2nd else -> euc/shift 0) (utf8-2-2nd 'UTF-8 (>= c #x80) -> utf8 else -> euc/shift 0) (wide-or-8bit '8-BIT (zero? c) -> null else -> wide-or-8bit) (wide 'UTF-16 (zero? c) -> null else -> wide) (null 'UTF-16 (zero? c) -> null2 else -> wide) (null2 (if (even? i) 'UTF-32 'UTF-16) (zero? c) -> null3 (and (even? i) (< (+ i 1) limit) (= #xFE c) (= #xFF (next (+ i 1)))) --> 'UTF32BE else -> wide) (utf8-3-2nd 'UTF-8 (>= c #x80) -> utf8-3-3rd else -> euc/shift 0) (utf8-3-3rd 'UTF-8 (>= c #x80) -> utf8 else -> euc/shift -1) (null3 'UTF-32 (zero? c) --> 'BINARY else -> utf32) (utf32 'UTF-32 else --> 'UTF-32) ) ;; could be any to start (any 0))) (define (detect-encoding str . o) (let ((type (detect-encoding-type str))) (detect-locale-encoding str type (if (pair? o) (car o) "*")))) (define detect-file-encoding (let ((rx (regexp "^(?:[^\n]*\n)?[^\n]*-\\*-[^\n]*\\bcoding:\\s*\\b(\\S+)\\b[^\n]*-\\*-" #t)) (cache (make-hash-table string=?))) (lambda (file . o) (let* ((fullname file) (last (hash-table-ref/default cache fullname #f))) (if (and last (>= (cdr last) (file-modification-time fullname))) (car last) (let ((str (with-input-from-file file (cut read-string 1024)))) (let ((res (cond ((string-match rx str) => cadr) (else (apply detect-encoding str o))))) (hash-table-set! cache fullname (cons res (current-seconds))) res))))))) )