;;; Copyright © 2019 Mark H Weaver ;;; ;;; This program is free software: you can redistribute it and/or modify ;;; it under the terms of the GNU General Public License as published by ;;; the Free Software Foundation, either version 3 of the License, or ;;; (at your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;;; GNU General Public License for more details. ;;; ;;; You should have received a copy of the GNU General Public License ;;; along with this program. If not, see . (use-modules (rnrs bytevectors) ;; the following modules are needed only for the test. ;;(srfi srfi-1) ;;(ice-9 iconv) ) ;; Well-formed UTF-8 sequences ;; =========================== ;; 00..7F ;; C2..DF 80..BF ;; E0 *A0..BF 80..BF ;; E1..EC 80..BF 80..BF ;; ED 80..9F* 80..BF ;; EE..EF 80..BF 80..BF ;; F0 *90..BF 80..BF 80..BF ;; F1..F3 80..BF 80..BF 80..BF ;; F4 80..8F* 80..BF 80..BF ;; UTF-8 Decoder states ;; ==================== ;; 0 start state ;; C2 .. DF got 1/2 bytes ;; E0 .. EF got 1/3 bytes ;; F0 .. F4 got 1/4 bytes ;; E0A0 .. ED9F got 2/3 bytes (range 1) ;; EE80 .. EFBF got 2/3 bytes (range 2) ;; F090 .. F48F got 2/4 bytes ;; F09080 .. F48FBF got 3/4 bytes (define-syntax-rule (utf8-decode ((j init-expr) ...) (i continue) (output (code-point) e1 e1* ...) (error (maximal-subpart) e2 e2* ...) state-expr bv-expr start-expr end-expr) "Let BYTE-STR denote the concatenation of the following two byte strings: (1) the bytes encoded in STATE-EXPR, and (2) the bytevector BV-EXPR beginning with index START-EXPR (inclusive) and ending with index END-EXPR (exclusive). STATE-EXPR must evaluate to an exact integer between 0 and #xF48FBF that encodes a proper prefix of a well-formed UTF-8 sequence. The bytes are in big-endian order, e.g. #xF48FBF encodes (F4 8F BF) and 0 encodes the empty string. Let TAIL be the longest suffix of BYTE-STR that is a proper prefix of a well-formed UTF-8 byte sequence. Let ITEMS be a list of strings whose concatenation equals BYTE-STR with TAIL removed, such that each element in ITEMS is either (1) a well-formed UTF-8 byte sequence, or (2) a maximal subpart of an ill-formed subsequence, as defined in section 3.9 of The Unicode Standard 12.0, i.e. the longest code unit subsequence starting at an inconvertible offset that is either (a) the initial subsequence of a well-formed code unit sequence, or (b) a subsequence of length one. UTF8-DECODE iterates over ITEMS from left to right, evaluating the 'output' expressions (E1 E1* ...) for each well-formed UTF-8 byte sequence, and the 'error' expressions (E2 E2* ...) for each maximal subpart of an ill-formed subsequence. As with a 'fold' operation, zero or more seeds are included in the iterator state, bound to variables (J ...) with initial values (INIT-EXPR ...). Each of the user-provided expression sequences (E1 E1* ...) and (E2 E2* ...) have access to the current seed values (J ...), and produce new seed values each time they are called. The user-provided expression sequences have access to the following variables, whose identifiers are specified by operands to UTF8-DECODE: (1) I, the bytevector index immediately following the current item, i.e. where decoding should resume after processing this item. (2) (J ...) the user-provided seed values. (3) CODE-POINT [only bound in (E1 E1* ...)], an exact integer Unicode scalar value. (4) MAXIMAL-SUBPART [only bound in (E2 E2* ...)], the maximal subpart of the ill-formed subsequence, represented as an exact integer from #xC2 to #xF48FBF containing the bytes in big-endian order. For example, #xF48FBF represents the byte string (F4 8F BF). (5) CONTINUE, a procedure which may be applied to arguments (I J^ ...) by the user-provided expression sequences to continue decoding with the new seed values (J^ ...). (6) OUTPUT, a procedure which may be applied to arguments (CODE-POINT I J^ ...) to evaluate the user-provided expression sequence (E1 E1* ...). (7) ERROR, a procedure which may be applied to arguments (MAXIMAL-SUBPART I J^ ...) to evaluate the user-provided expression sequence (E2 E2* ...). Each user-provided expression sequence may choose either to continue the loop by calling (CONTINUE I J^ ...), or to terminate the loop by returning (values 0 I J^ ...), where (J^ ...) are the new seed values. OUTPUT and ERROR are included for convenience, to allow the output and error expression sequences to call each other. For example, it may be convenient for the error expression sequence to end with: (output #xFFFD i j ...) If the end of BYTE-STR is reached, UTF8-DECODE returns the values (NEW-STATE END J^ ...), where NEW-STATE encodes the bytes in TAIL, END equals END-EXPR, and (J^ ...) are the final seed values. In any case, if the user-provided expressions behave as specified above, then when UTF8-DECODE returns values (NEW-STATE NEW-POS J^ ...), every byte in BYTE-STR will have been reported in exactly one of the following ways: (1) as part of a well-formed UTF-8 byte sequence, reported to the user-provided output expressions (E1 E1* ...), or (2) as part of a maximal subpart of an ill-formed subsequence, reported to the user-provided error expressions (E2 E2* ...), or (3) as part of NEW-STATE, or (4) as part of the bytevector starting at index NEW-POS." (let ((bv bv-expr) (end end-expr)) (define (output code-point i j ...) e1 e1* ...) (define (error maximal-subpart i j ...) e2 e2* ...) (define (continue i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (cond ((<= byte #x7F) (output byte (+ i 1) j ...)) ((<= #xC2 byte #xF4) (got-1 byte (+ i 1) j ...)) (else (error byte (+ i 1) j ...)))) (values 0 i j ...))) (define (got-1 state i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (cond ((not (<= #x80 byte #xBF)) (error state i j ...)) ((<= state #xDF) (output (logior (ash (logand state #x1F) 6) (logand byte #x3F)) (+ i 1) j ...)) (else (let ((state^ (logior (ash state 8) byte))) (cond ((or (<= #xE0A0 state^ #xED9F) (<= #xEE80 state^ #xEFBF)) (got-2/3 state^ (+ i 1) j ...)) ((<= #xF090 state^ #xF48F) (got-2/4 state^ (+ i 1) j ...)) (else (error state i j ...))))))) (values state i j ...))) (define (got-2/3 state i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (if (<= #x80 byte #xBF) (output (logior (ash (logand state #xF00) 4) (ash (logand state #x3F) 6) (logand byte #x3F)) (+ i 1) j ...) (error state i j ...))) (values state i j ...))) (define (got-2/4 state i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (if (<= #x80 byte #xBF) (got-3/4 (logior (ash state 8) byte) (+ i 1) j ...) (error state i j ...))) (values state i j ...))) (define (got-3/4 state i j ...) (if (< i end) (let ((byte (bytevector-u8-ref bv i))) (if (<= #x80 byte #xBF) (output (logior (ash (logand state #x70000) 2) (ash (logand state #x3F00) 4) (ash (logand state #x3F) 6) (logand byte #x3F)) (+ i 1) j ...) (error state i j ...))) (values state i j ...))) (define (enter state i j ...) (cond ((zero? state) (continue i j ...)) ((<= state #xF4) (got-1 state i j ...)) ((<= state #xEFBF) (got-2/3 state i j ...)) ((<= state #xF48F) (got-2/4 state i j ...)) (else (got-3/4 state i j ...)))) (enter state-expr start-expr init-expr ...))) (define (utf8->string! state source source-start source-end target target-start target-end) "Let BYTE-STR denote the concatenation of the following two byte strings: (1) the bytes encoded in STATE, and (2) the bytevector SOURCE beginning with index SOURCE-START (inclusive) and ending with index SOURCE-END (exclusive). STATE must be an exact integer between 0 and #xF48FBF that encodes a proper prefix of a well-formed UTF-8 sequence. The bytes are in big-endian order, e.g. #xF48FBF encodes (F4 8F BF), and 0 encodes the empty string. Let TAIL be the longest suffix of BYTE-STR that is a proper prefix of a well-formed UTF-8 byte sequence, and let BYTE-STR-SANS-TAIL be BYTE-STR with TAIL removed. UTF8->STRING! permissively decodes the Unicode 8-bit string BYTE-STR-SANS-TAIL and writes the resulting characters to the string TARGET beginning with index TARGET-START (inclusive) and ending with index TARGET-END (exclusive). In case of decoding errors, each 'maximal subpart of an ill-formed subsequence', as defined in section 3.9 of The Unicode Standard 12.0, is replaced with a Unicode replacement character (U+FFFD). UTF8->STRING! returns three values (NEW-STATE SOURCE-POS TARGET-POS). If the target string is able to hold all of the decoded characters and replacement characters, then NEW-STATE encodes the bytes in TAIL, SOURCE-POS equals SOURCE-END, and TARGET-POS equals TARGET-START plus the number of characters written. If there's not enough space in the target string, then NEW-STATE is 0, SOURCE-POS is the index of the first byte that is not represented by the characters written, and TARGET-POS equals TARGET-END." (if (< target-start target-end) (utf8-decode ((j target-start)) (i continue) (output (code-point) (string-set! target j (integer->char code-point)) (if (< (+ j 1) target-end) (continue i (+ j 1)) (values 0 i (+ j 1)))) (error (maximal-subpart) (output #xFFFD i j)) ;TODO: support other error handlers state source source-start source-end) (values state source-start target-start))) ;; Another experimental primitive, slower than the ones above. (define* (utf8-fold* out err seed state bv #:optional (start 0) (end (bytevector-length bv))) "Let BYTE-STR denote the concatenation of the following two byte strings: (1) the bytes encoded in STATE, and (2) the bytevector BV beginning with index START (inclusive) and ending with index END (exclusive). STATE must be an exact integer between 0 and #xF48FBF that encodes a proper prefix of a well-formed UTF-8 sequence. The bytes are in big-endian order, e.g. #xF48FBF encodes (F4 8F BF), and 0 encodes the empty string. Let TAIL be the longest suffix of BYTE-STR that is a proper prefix of a well-formed UTF-8 byte sequence. Let ITEMS be a list of strings whose concatenation equals BYTE-STR with TAIL removed, such that each element in ITEMS is either (1) a well-formed UTF-8 byte sequence, or (2) a maximal subpart of an ill-formed subsequence, as defined in section 3.9 of The Unicode Standard 12.0, i.e. the longest code unit subsequence starting at an inconvertible offset that is either (a) the initial subsequence of a well-formed code unit sequence, or (b) a subsequence of length one. UTF8-FOLD* iterates over ITEMS from left to right, calling OUT for each well-formed UTF-8 byte sequence, and ERR for each maximal subpart of an ill-formed subsequence. For each well-formed UTF-8 byte sequence, (OUT CODE-POINT INDEX SEED K) is called, where CODE-POINT is the Unicode scalar value as an exact integer, INDEX is the bytevector index immediately following the decoded code point, SEED is the current seed value, and K is the continuation. OUT may choose to either continue decoding or to exit the loop. To continue decoding, call (K INDEX NEW-SEED). To exit, return (values 0 INDEX FINAL-SEED). For each maximal subpart of an ill-formed UTF-8 byte sequence, (ERR MAXIMAL-SUBPART INDEX SEED K) is called, where MAXIMAL-SUBPART is an exact integer between #xC2 and #xF48FBF containing the bytes in big-endian order, and INDEX is the bytevector index immediately following those bytes. For example, #xF48FBF represents the byte string (F4 8F BF). Like OUT, ERR may either call (K INDEX NEW-SEED) to continue, or return (values 0 INDEX FINAL-SEED) to exit the loop. If the end of BYTE-STR is reached, UTF8-FOLD* returns the values (NEW-STATE END FINAL-SEED), where NEW-STATE encodes the bytes in TAIL. In any case, if the user-provided procedures behave as specified above, then when UTF8-FOLD* returns values (NEW-STATE NEW-POS FINAL-SEED), every byte in BYTE-STR will have been reported in exactly one of the following ways: (1) as part of a well-formed UTF-8 byte sequence, reported to OUT, (2) as part of a maximal subpart of an ill-formed subsequence, reported to ERR, (3) as part of NEW-STATE, or (4) as part of the bytevector starting at index NEW-POS." (utf8-decode ((j seed)) (i continue) (output (code-point) (out code-point i j continue)) (error (maximal-subpart) (err maximal-subpart i j continue)) state bv start end)) ;; Another experimental primitive, slower than the ones above. (define* (utf8-fold out err seed state bv #:optional (start 0) (end (bytevector-length bv))) "Let BYTE-STR denote the concatenation of the following two byte strings: (1) the bytes encoded in STATE, and (2) the bytevector BV beginning with index START (inclusive) and ending with index END (exclusive). STATE must be an exact integer between 0 and #xF48FBF that encodes a proper prefix of a well-formed UTF-8 sequence. The bytes are in big-endian order, e.g. #xF48FBF encodes (F4 8F BF), and 0 encodes the empty string. Let TAIL be the longest suffix of BYTE-STR that is a proper prefix of a well-formed UTF-8 byte sequence. Let ITEMS be a list of strings whose concatenation equals BYTE-STR with TAIL removed, such that each element in ITEMS is either (1) a well-formed UTF-8 byte sequence, or (2) a maximal subpart of an ill-formed subsequence, as defined in section 3.9 of The Unicode Standard 12.0, i.e. the longest code unit subsequence starting at an inconvertible offset that is either (a) the initial subsequence of a well-formed code unit sequence, or (b) a subsequence of length one. UTF8-FOLD iterates over ITEMS from left to right, calling OUT for each well-formed UTF-8 byte sequence, and ERR for each maximal subpart of an ill-formed subsequence. For each well-formed UTF-8 byte sequence, (OUT CODE-POINT INDEX SEED) is called, where CODE-POINT is the Unicode scalar value as an exact integer, INDEX is the bytevector index immediately following the decoded code point, and SEED is the current seed value. OUT should return the new seed value. For each maximal subpart of an ill-formed UTF-8 byte sequence, (ERR MAXIMAL-SUBPART INDEX SEED) is called, where MAXIMAL-SUBPART is an exact integer between #xC2 and #xF48FBF containing the bytes in big-endian order, and INDEX is the bytevector index immediately following those bytes. For example, #xF48FBF represents the byte string (F4 8F BF). ERR should return two values: (CONTINUE? NEW-SEED), where CONTINUE? is a boolean specifying whether to continue the loop. If the end of BYTE-STR is reached, UTF8-FOLD returns the values (NEW-STATE END FINAL-SEED), where NEW-STATE encodes the bytes in TAIL. In any case, if the user-provided procedures behave as specified above, then when UTF8-FOLD returns values (NEW-STATE NEW-POS FINAL-SEED), every byte in BYTE-STR will have been reported in exactly one of the following ways: (1) as part of a well-formed UTF-8 byte sequence, reported to OUT, (2) as part of a maximal subpart of an ill-formed subsequence, reported to ERR, (3) as part of NEW-STATE, or (4) as part of the bytevector starting at index NEW-POS." (utf8-fold* (lambda (code-point i j continue) (continue i (out code-point i j))) (lambda (maximal-subpart i j continue) (call-with-values (lambda () (err maximal-subpart i j)) (lambda (continue? j^) (if continue? (continue i j^) (values 0 i j^))))) seed state bv start end)) ;; A not-so-quick test of all valid characters. ;; TODO: Tests of strictness and error handling. #; (let () (define ss (string-tabulate (lambda (i) (if (< i #xD800) (integer->char i) (integer->char (+ i #x800)))) (- #x110000 #x800))) (define bv (string->utf8 ss)) (define bv-len (bytevector-length bv)) (define slen (* 2 (string-length ss))) (define s (make-string slen)) (every (lambda (incr) (string-fill! s #\a) (call-with-values (lambda () (let loop ((state 0) (i 0) (j 0)) (if (< i bv-len) (call-with-values (lambda () (utf8->string! state bv i (min bv-len (+ i incr)) s j slen)) loop) (values state i j)))) (lambda (state i j) (and (zero? state) (= i bv-len) (= j (string-length ss)) (string=? ss (substring s 0 j)))))) (iota 5 1)))