;;; Copyright © 2019 Mark H Weaver <mhw@netris.org>
;;;
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License as published by
;;; the Free Software Foundation, either version 3 of the License, or
;;; (at your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program.  If not, see <http://www.gnu.org/licenses/>.

(use-modules (rnrs bytevectors)
             ;; the following modules are needed only for the test.
             ;;(srfi srfi-1)
             ;;(ice-9 iconv)
             )

;; Well-formed UTF-8 sequences
;; ===========================
;; 00..7F
;; C2..DF   80..BF
;; E0      *A0..BF   80..BF
;; E1..EC   80..BF   80..BF
;; ED       80..9F*  80..BF
;; EE..EF   80..BF   80..BF
;; F0      *90..BF   80..BF   80..BF
;; F1..F3   80..BF   80..BF   80..BF
;; F4       80..8F*  80..BF   80..BF

;; UTF-8 Decoder states
;; ====================
;;  0                 start state
;;  C2     .. DF      got 1/2 bytes
;;  E0     .. EF      got 1/3 bytes
;;  F0     .. F4      got 1/4 bytes
;;  E0A0   .. ED9F    got 2/3 bytes (range 1)
;;  EE80   .. EFBF    got 2/3 bytes (range 2)
;;  F090   .. F48F    got 2/4 bytes
;;  F09080 .. F48FBF  got 3/4 bytes

(define-syntax-rule (utf8-decode ((j init-expr) ...)
                                 (i continue)
                                 (output (code-point) e1 e1* ...)
                                 (error (maximal-subpart) e2 e2* ...)
                                 state-expr bv-expr start-expr end-expr)
  "Let BYTE-STR denote the concatenation of the following two byte
strings: (1) the bytes encoded in STATE-EXPR, and (2) the bytevector
BV-EXPR beginning with index START-EXPR (inclusive) and ending with
index END-EXPR (exclusive).

STATE-EXPR must evaluate to an exact integer between 0 and #xF48FBF
that encodes a proper prefix of a well-formed UTF-8 sequence.  The
bytes are in big-endian order, e.g. #xF48FBF encodes (F4 8F BF) and
0 encodes the empty string.

Let TAIL be the longest suffix of BYTE-STR that is a proper prefix of
a well-formed UTF-8 byte sequence.

Let ITEMS be a list of strings whose concatenation equals BYTE-STR
with TAIL removed, such that each element in ITEMS is either (1) a
well-formed UTF-8 byte sequence, or (2) a maximal subpart of an
ill-formed subsequence, as defined in section 3.9 of The Unicode
Standard 12.0, i.e. the longest code unit subsequence starting at an
inconvertible offset that is either (a) the initial subsequence of a
well-formed code unit sequence, or (b) a subsequence of length one.

UTF8-DECODE iterates over ITEMS from left to right, evaluating the
'output' expressions (E1 E1* ...) for each well-formed UTF-8 byte
sequence, and the 'error' expressions (E2 E2* ...) for each maximal
subpart of an ill-formed subsequence.

As with a 'fold' operation, zero or more seeds are included in the
iterator state, bound to variables (J ...) with initial values
(INIT-EXPR ...).  Each of the user-provided expression sequences
(E1 E1* ...) and (E2 E2* ...) have access to the current seed values
(J ...), and produce new seed values each time they are called.

The user-provided expression sequences have access to the following
variables, whose identifiers are specified by operands to UTF8-DECODE:

(1) I, the bytevector index immediately following the current item,
    i.e. where decoding should resume after processing this item.

(2) (J ...) the user-provided seed values.

(3) CODE-POINT [only bound in (E1 E1* ...)], an exact integer
    Unicode scalar value.

(4) MAXIMAL-SUBPART [only bound in (E2 E2* ...)], the maximal subpart
    of the ill-formed subsequence, represented as an exact integer
    from #xC2 to #xF48FBF containing the bytes in big-endian order.
    For example, #xF48FBF represents the byte string (F4 8F BF).

(5) CONTINUE, a procedure which may be applied to arguments (I J^ ...)
    by the user-provided expression sequences to continue decoding
    with the new seed values (J^ ...).

(6) OUTPUT, a procedure which may be applied to arguments
    (CODE-POINT I J^ ...) to evaluate the user-provided expression
    sequence (E1 E1* ...).

(7) ERROR, a procedure which may be applied to arguments
    (MAXIMAL-SUBPART I J^ ...) to evaluate the user-provided expression
    sequence (E2 E2* ...).

Each user-provided expression sequence may choose either to continue
the loop by calling (CONTINUE I J^ ...), or to terminate the loop by
returning (values 0 I J^ ...), where (J^ ...) are the new seed values.

OUTPUT and ERROR are included for convenience, to allow the output and
error expression sequences to call each other.  For example, it may be
convenient for the error expression sequence to end with:

  (output #xFFFD i j ...)

If the end of BYTE-STR is reached, UTF8-DECODE returns the values
(NEW-STATE END J^ ...), where NEW-STATE encodes the bytes in TAIL,
END equals END-EXPR, and (J^ ...) are the final seed values.

In any case, if the user-provided expressions behave as specified
above, then when UTF8-DECODE returns values (NEW-STATE NEW-POS J^ ...),
every byte in BYTE-STR will have been reported in exactly one of the
following ways:

(1) as part of a well-formed UTF-8 byte sequence, reported to the
    user-provided output expressions (E1 E1* ...), or
(2) as part of a maximal subpart of an ill-formed subsequence,
    reported to the user-provided error expressions (E2 E2* ...), or
(3) as part of NEW-STATE, or
(4) as part of the bytevector starting at index NEW-POS."

  (let ((bv   bv-expr)
        (end  end-expr))

    (define (output code-point i j ...)
      e1 e1* ...)

    (define (error maximal-subpart i j ...)
      e2 e2* ...)
    
    (define (continue i j ...)
      (if (< i end)
          (let ((byte (bytevector-u8-ref bv i)))
            (cond ((<= byte #x7F)       (output byte (+ i 1) j ...))
                  ((<= #xC2 byte #xF4)  (got-1 byte (+ i 1) j ...))
                  (else                 (error byte (+ i 1) j ...))))
          (values 0 i j ...)))

    (define (got-1 state i j ...)
      (if (< i end)
          (let ((byte (bytevector-u8-ref bv i)))
            (cond ((not (<= #x80 byte #xBF))
                   (error state i j ...))
                  ((<= state #xDF)
                   (output (logior (ash (logand state #x1F) 6)
                                   (logand byte #x3F))
                           (+ i 1) j ...))
                  (else
                   (let ((state^ (logior (ash state 8) byte)))
                     (cond ((or (<= #xE0A0 state^ #xED9F)
                                (<= #xEE80 state^ #xEFBF))
                            (got-2/3 state^ (+ i 1) j ...))
                           ((<= #xF090 state^ #xF48F)
                            (got-2/4 state^ (+ i 1) j ...))
                           (else
                            (error state i j ...)))))))
          (values state i j ...)))

    (define (got-2/3 state i j ...)
      (if (< i end)
          (let ((byte (bytevector-u8-ref bv i)))
            (if (<= #x80 byte #xBF)
                (output (logior (ash (logand state #xF00) 4)
                                (ash (logand state #x3F) 6)
                                (logand byte #x3F))
                        (+ i 1) j ...)
                (error state i j ...)))
          (values state i j ...)))

    (define (got-2/4 state i j ...)
      (if (< i end)
          (let ((byte (bytevector-u8-ref bv i)))
            (if (<= #x80 byte #xBF)
                (got-3/4 (logior (ash state 8) byte) (+ i 1) j ...)
                (error state i j ...)))
          (values state i j ...)))

    (define (got-3/4 state i j ...)
      (if (< i end)
          (let ((byte (bytevector-u8-ref bv i)))
            (if (<= #x80 byte #xBF)
                (output (logior (ash (logand state #x70000) 2)
                                (ash (logand state #x3F00) 4)
                                (ash (logand state #x3F) 6)
                                (logand byte #x3F))
                        (+ i 1) j ...)
                (error state i j ...)))
          (values state i j ...)))

    (define (enter state i j ...)
      (cond ((zero? state)      (continue i j ...))
            ((<= state #xF4)    (got-1 state i j ...))
            ((<= state #xEFBF)  (got-2/3 state i j ...))
            ((<= state #xF48F)  (got-2/4 state i j ...))
            (else               (got-3/4 state i j ...))))

    (enter state-expr start-expr init-expr ...)))

(define (utf8->string! state source source-start source-end
                       target target-start target-end)
  "Let BYTE-STR denote the concatenation of the following two byte
strings: (1) the bytes encoded in STATE, and (2) the bytevector
SOURCE beginning with index SOURCE-START (inclusive) and ending with
index SOURCE-END (exclusive).

STATE must be an exact integer between 0 and #xF48FBF that encodes
a proper prefix of a well-formed UTF-8 sequence.  The bytes are in
big-endian order, e.g. #xF48FBF encodes (F4 8F BF), and 0 encodes the
empty string.

Let TAIL be the longest suffix of BYTE-STR that is a proper prefix of
a well-formed UTF-8 byte sequence, and let BYTE-STR-SANS-TAIL be
BYTE-STR with TAIL removed.

UTF8->STRING! permissively decodes the Unicode 8-bit string
BYTE-STR-SANS-TAIL and writes the resulting characters to the string
TARGET beginning with index TARGET-START (inclusive) and ending with
index TARGET-END (exclusive).

In case of decoding errors, each 'maximal subpart of an ill-formed
subsequence', as defined in section 3.9 of The Unicode Standard 12.0,
is replaced with a Unicode replacement character (U+FFFD).

UTF8->STRING! returns three values (NEW-STATE SOURCE-POS TARGET-POS).

If the target string is able to hold all of the decoded characters and
replacement characters, then NEW-STATE encodes the bytes in TAIL,
SOURCE-POS equals SOURCE-END, and TARGET-POS equals TARGET-START plus
the number of characters written.

If there's not enough space in the target string, then NEW-STATE is 0,
SOURCE-POS is the index of the first byte that is not represented by
the characters written, and TARGET-POS equals TARGET-END."
  (if (< target-start target-end)
      (utf8-decode ((j target-start))
                   (i continue)
                   (output (code-point)
                           (string-set! target j (integer->char code-point))
                           (if (< (+ j 1) target-end)
                               (continue i (+ j 1))
                               (values 0 i (+ j 1))))
                   (error (maximal-subpart)
                          (output #xFFFD i j)) ;TODO: support other error handlers
                   state source source-start source-end)
      (values state source-start target-start)))

;; Another experimental primitive, slower than the ones above.
(define* (utf8-fold* out err seed state bv
                     #:optional (start 0) (end (bytevector-length bv)))
  "Let BYTE-STR denote the concatenation of the following two byte
strings: (1) the bytes encoded in STATE, and (2) the bytevector BV
beginning with index START (inclusive) and ending with index END
(exclusive).

STATE must be an exact integer between 0 and #xF48FBF that encodes
a proper prefix of a well-formed UTF-8 sequence.  The bytes are in
big-endian order, e.g. #xF48FBF encodes (F4 8F BF), and 0 encodes
the empty string.

Let TAIL be the longest suffix of BYTE-STR that is a proper prefix of
a well-formed UTF-8 byte sequence.

Let ITEMS be a list of strings whose concatenation equals BYTE-STR
with TAIL removed, such that each element in ITEMS is either (1) a
well-formed UTF-8 byte sequence, or (2) a maximal subpart of an
ill-formed subsequence, as defined in section 3.9 of The Unicode
Standard 12.0, i.e. the longest code unit subsequence starting at an
inconvertible offset that is either (a) the initial subsequence of a
well-formed code unit sequence, or (b) a subsequence of length one.

UTF8-FOLD* iterates over ITEMS from left to right, calling OUT for
each well-formed UTF-8 byte sequence, and ERR for each maximal subpart
of an ill-formed subsequence.

For each well-formed UTF-8 byte sequence, (OUT CODE-POINT INDEX SEED K)
is called, where CODE-POINT is the Unicode scalar value as an exact
integer, INDEX is the bytevector index immediately following the
decoded code point, SEED is the current seed value, and K is the
continuation.  OUT may choose to either continue decoding or to
exit the loop.  To continue decoding, call (K INDEX NEW-SEED).
To exit, return (values 0 INDEX FINAL-SEED).

For each maximal subpart of an ill-formed UTF-8 byte sequence,
(ERR MAXIMAL-SUBPART INDEX SEED K) is called, where MAXIMAL-SUBPART
is an exact integer between #xC2 and #xF48FBF containing the bytes
in big-endian order, and INDEX is the bytevector index immediately
following those bytes.  For example, #xF48FBF represents the byte
string (F4 8F BF).  Like OUT, ERR may either call (K INDEX NEW-SEED)
to continue, or return (values 0 INDEX FINAL-SEED) to exit the loop.

If the end of BYTE-STR is reached, UTF8-FOLD* returns the values
(NEW-STATE END FINAL-SEED), where NEW-STATE encodes the bytes in
TAIL.

In any case, if the user-provided procedures behave as specified above,
then when UTF8-FOLD* returns values (NEW-STATE NEW-POS FINAL-SEED),
every byte in BYTE-STR will have been reported in exactly one of the
following ways:

(1) as part of a well-formed UTF-8 byte sequence, reported to OUT,
(2) as part of a maximal subpart of an ill-formed subsequence,
    reported to ERR,
(3) as part of NEW-STATE, or
(4) as part of the bytevector starting at index NEW-POS."
  (utf8-decode ((j seed))
               (i continue)
               (output (code-point)
                       (out code-point i j continue))
               (error (maximal-subpart)
                      (err maximal-subpart i j continue))
               state bv start end))

;; Another experimental primitive, slower than the ones above.
(define* (utf8-fold out err seed state bv
                    #:optional (start 0) (end (bytevector-length bv)))
  "Let BYTE-STR denote the concatenation of the following two byte
strings: (1) the bytes encoded in STATE, and (2) the bytevector BV
beginning with index START (inclusive) and ending with index END
(exclusive).

STATE must be an exact integer between 0 and #xF48FBF that encodes
a proper prefix of a well-formed UTF-8 sequence.  The bytes are in
big-endian order, e.g. #xF48FBF encodes (F4 8F BF), and 0 encodes
the empty string.

Let TAIL be the longest suffix of BYTE-STR that is a proper prefix of
a well-formed UTF-8 byte sequence.

Let ITEMS be a list of strings whose concatenation equals BYTE-STR
with TAIL removed, such that each element in ITEMS is either (1) a
well-formed UTF-8 byte sequence, or (2) a maximal subpart of an
ill-formed subsequence, as defined in section 3.9 of The Unicode
Standard 12.0, i.e. the longest code unit subsequence starting at an
inconvertible offset that is either (a) the initial subsequence of a
well-formed code unit sequence, or (b) a subsequence of length one.

UTF8-FOLD iterates over ITEMS from left to right, calling OUT for
each well-formed UTF-8 byte sequence, and ERR for each maximal subpart
of an ill-formed subsequence.

For each well-formed UTF-8 byte sequence, (OUT CODE-POINT INDEX SEED)
is called, where CODE-POINT is the Unicode scalar value as an exact
integer, INDEX is the bytevector index immediately following the
decoded code point, and SEED is the current seed value.  OUT should
return the new seed value.

For each maximal subpart of an ill-formed UTF-8 byte sequence,
(ERR MAXIMAL-SUBPART INDEX SEED) is called, where MAXIMAL-SUBPART
is an exact integer between #xC2 and #xF48FBF containing the bytes
in big-endian order, and INDEX is the bytevector index immediately
following those bytes.  For example, #xF48FBF represents the byte
string (F4 8F BF).  ERR should return two values: (CONTINUE? NEW-SEED),
where CONTINUE? is a boolean specifying whether to continue the loop.

If the end of BYTE-STR is reached, UTF8-FOLD returns the values
(NEW-STATE END FINAL-SEED), where NEW-STATE encodes the bytes in
TAIL.

In any case, if the user-provided procedures behave as specified above,
then when UTF8-FOLD returns values (NEW-STATE NEW-POS FINAL-SEED),
every byte in BYTE-STR will have been reported in exactly one of the
following ways:

(1) as part of a well-formed UTF-8 byte sequence, reported to OUT,
(2) as part of a maximal subpart of an ill-formed subsequence,
    reported to ERR,
(3) as part of NEW-STATE, or
(4) as part of the bytevector starting at index NEW-POS."
  (utf8-fold* (lambda (code-point i j continue)
                (continue i (out code-point i j)))
              (lambda (maximal-subpart i j continue)
                (call-with-values (lambda () (err maximal-subpart i j))
                  (lambda (continue? j^)
                    (if continue?
                        (continue i j^)
                        (values 0 i j^)))))
              seed state bv start end))

;; A not-so-quick test of all valid characters.
;; TODO: Tests of strictness and error handling.
#;
(let ()
  (define ss (string-tabulate (lambda (i)
                                (if (< i #xD800)
                                    (integer->char i)
                                    (integer->char (+ i #x800))))
                              (- #x110000 #x800)))
  (define bv (string->utf8 ss))
  (define bv-len (bytevector-length bv))
  (define slen (* 2 (string-length ss)))
  (define s (make-string slen))
  (every (lambda (incr)
           (string-fill! s #\a)
           (call-with-values
               (lambda ()
                 (let loop ((state 0) (i 0) (j 0))
                   (if (< i bv-len)
                       (call-with-values
                           (lambda ()
                             (utf8->string! state bv i (min bv-len
                                                            (+ i incr))
                                            s j slen))
                         loop)
                       (values state i j))))
             (lambda (state i j)
               (and (zero? state)
                    (= i bv-len)
                    (= j (string-length ss))
                    (string=? ss (substring s 0 j))))))
         (iota 5 1)))