;;; Copyright © 2019 Mark H Weaver ;;; ;;; This program is free software: you can redistribute it and/or modify ;;; it under the terms of the GNU General Public License as published by ;;; the Free Software Foundation, either version 3 of the License, or ;;; (at your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;;; GNU General Public License for more details. ;;; ;;; You should have received a copy of the GNU General Public License ;;; along with this program. If not, see . (use-modules (rnrs bytevectors) ;; the following modules are only needed for the test. ;;(srfi srfi-1) ;;(ice-9 iconv) ) ;; Well-formed UTF-8 sequences ;; =========================== ;; 00..7F ;; C2..DF 80..BF ;; E0 *A0..BF 80..BF ;; E1..EC 80..BF 80..BF ;; ED 80..9F* 80..BF ;; EE..EF 80..BF 80..BF ;; F0 *90..BF 80..BF 80..BF ;; F1..F3 80..BF 80..BF 80..BF ;; F4 80..8F* 80..BF 80..BF ;; UTF-8 Decoder states ;; ==================== ;; 0 start state ;; C2 .. DF got 1/2 bytes ;; E0 .. EF got 1/3 bytes ;; F0 .. F4 got 1/4 bytes ;; E0A0 .. ED9F got 2/3 bytes (range 1) ;; EE80 .. EFBF got 2/3 bytes (range 2) ;; F090 .. F48F got 2/4 bytes ;; F09080 .. F48FBF got 3/4 bytes (define-syntax-rule (utf8-decode ((j init-expr) ...) (i continue) (output (code-point) e1 e1* ...) (error (maximal-subpart) e2 e2* ...) state-expr bv-expr start-expr end-expr) ;; The start state is 0. ;; When 'error' is called with arguments (state i j ...), 'state' ;; contains the bytes of the "maximal subpart of an ill-formed ;; subsequence" as defined in The Unicode Standard section 3.9, ;; i.e. the bytes which are being represented by this error report ;; and which are not being converted. 'i' is the bytevector index ;; immediately following this maximal subpart, i.e. the index where ;; decoding should resume. 'j ...' are the user-provided seeds. ;; the decoder returns the values: (new-state bv-pos j ...) (let ((bv bv-expr) (end end-expr)) (define (output code-point i j ...) e1 e1* ...) (define (error maximal-subpart i j ...) e2 e2* ...) (define (continue i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (cond ((<= byte #x7F) (output byte (+ i 1) j ...)) ((<= #xC2 byte #xF4) (got-1 byte (+ i 1) j ...)) (else (error byte (+ i 1) j ...)))) (values 0 i j ...))) (define (got-1 state i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (cond ((not (<= #x80 byte #xBF)) (error state i j ...)) ((<= state #xDF) (output (logior (ash (logand state #x1F) 6) (logand byte #x3F)) (+ i 1) j ...)) (else (let ((state^ (logior (ash state 8) byte))) (cond ((or (<= #xE0A0 state^ #xED9F) (<= #xEE80 state^ #xEFBF)) (got-2/3 state^ (+ i 1) j ...)) ((<= #xF090 state^ #xF48F) (got-2/4 state^ (+ i 1) j ...)) (else (error state i j ...))))))) (values state i j ...))) (define (got-2/3 state i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (if (<= #x80 byte #xBF) (output (logior (ash (logand state #xF00) 4) (ash (logand state #x3F) 6) (logand byte #x3F)) (+ i 1) j ...) (error state i j ...))) (values state i j ...))) (define (got-2/4 state i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (if (<= #x80 byte #xBF) (got-3/4 (logior (ash state 8) byte) (+ i 1) j ...) (error state i j ...))) (values state i j ...))) (define (got-3/4 state i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (if (<= #x80 byte #xBF) (output (logior (ash (logand state #x70000) 2) (ash (logand state #x3F00) 4) (ash (logand state #x3F) 6) (logand byte #x3F)) (+ i 1) j ...) (error state i j ...))) (values state i j ...))) (define (enter state i j ...) (cond ((zero? state) (continue i j ...)) ((<= state #xF4) (got-1 state i j ...)) ((<= state #xEFBF) (got-2/3 state i j ...)) ((<= state #xF48F) (got-2/4 state i j ...)) (else (got-3/4 state i j ...)))) (enter state-expr start-expr init-expr ...))) ;; Returns three values: (new-state source-pos target-pos) (define (utf8->string! state source source-start source-end target target-start target-end) (utf8-decode ((j target-start)) (i continue) (output (code-point) (string-set! target j (integer->char code-point)) (if (< (+ j 1) target-end) (continue i (+ j 1)) (values 0 i (+ j 1)))) (error (maximal-subpart) (output #xFFFD i j)) ;TODO: support other error handlers state source source-start source-end)) ;; Another experimental primitive, slower than the ones above. (define* (utf8-fold* out err seed state bv #:optional (start 0) (end (bytevector-length bv))) (utf8-decode ((j seed)) (i continue) (output (code-point) (out code-point i j continue)) (error (maximal-subpart) (err maximal-subpart i j continue)) state bv start end)) ;; Another experimental primitive, slower than the ones above. (define* (utf8-fold out err seed state bv #:optional (start 0) (end (bytevector-length bv))) (utf8-fold* (lambda (code-point i j continue) (continue i (out code-point i j))) (lambda (maximal-subpart i j continue) (call-with-values (lambda () (err maximal-subpart i j)) (lambda (continue? j^) (if continue? (continue i j^) (values 0 i j^))))) seed state bv start end)) ;; A not-so-quick test of all valid characters. ;; TODO: Tests of strictness and error handling. #; (let () (define ss (string-tabulate (lambda (i) (if (< i #xD800) (integer->char i) (integer->char (+ i #x800)))) (- #x110000 #x800))) (define bv (string->utf8 ss)) (define bv-len (bytevector-length bv)) (define slen (* 2 (string-length ss))) (define s (make-string slen)) (every (lambda (incr) (string-fill! s #\a) (call-with-values (lambda () (let loop ((state 0) (i 0) (j 0)) (if (< i bv-len) (call-with-values (lambda () (utf8->string! state bv i (min bv-len (+ i incr)) s j slen)) loop) (values state i j)))) (lambda (state i j) (and (= i bv-len) (= j (string-length ss)) (string=? ss (substring s 0 j)))))) (iota 5 1)))