From 375576fcc57b77915e703d1103ac61a3cabadba2 Mon Sep 17 00:00:00 2001
From: Dianne Skoll <dianne@skoll.ca>
Date: Wed, 10 Sep 2025 13:25:27 -0400
Subject: [PATCH] Add better support for multi-byte character set.

The functions mbindex, mbstrlen and mbsubstr are just like
index, strlen and substr, but they use and return 1-based
character indexes rather than 1-based byte indexes.
---
 contrib/remind-conf-mode/remind-conf-mode.el |   3 +-
 man/remind.1.in                              |  59 +++++--
 src/err.h                                    |   4 +
 src/funcs.c                                  | 166 +++++++++++++++++++
 tests/test-rem                               |   6 +-
 tests/test.cmp                               |  42 ++++-
 tests/test.rem                               |  27 +++
 7 files changed, 292 insertions(+), 15 deletions(-)

diff --git a/contrib/remind-conf-mode/remind-conf-mode.el b/contrib/remind-conf-mode/remind-conf-mode.el
index 87b3fb7d..2983b5bd 100644
--- a/contrib/remind-conf-mode/remind-conf-mode.el
+++ b/contrib/remind-conf-mode/remind-conf-mode.el
@@ -171,7 +171,8 @@
          "dusk" "easterdate" "escape" "eval" "evaltrig" "filedate" "filedatetime"
          "filedir" "filename" "getenv" "hebdate" "hebday" "hebmon" "hebyear"
          "hour" "htmlescape" "htmlstriptags" "iif" "index" "isany" "isconst" "isdst"
-         "isleap" "isomitted" "language" "localtoutc" "lower" "max" "min"
+         "isleap" "isomitted" "language" "localtoutc" "lower" "max"
+         "mbindex" "mbstrlen" "mbsubstr" "min"
          "minsfromutc" "minute" "mon" "monnum" "moondate" "moondatetime"
          "moonphase" "moonrise" "moonrisedir" "moonset" "moonsetdir" "moontime"
          "multitrig" "ndawn" "ndusk" "nonconst" "nonomitted" "now" "ord" "orthodoxeaster"
diff --git a/man/remind.1.in b/man/remind.1.in
index fa466380..1f9247fd 100644
--- a/man/remind.1.in
+++ b/man/remind.1.in
@@ -2540,6 +2540,26 @@ word.  The \fBINT\fR data type corresponds to the C "int" type.
 The \fBSTRING\fR data type consists of strings of characters.  It is
 somewhat comparable to a C character array, but more closely resembles
 the string type in BASIC.
+.RS
+.PP
+\fBRemind\fR normally expects to be running in a UTF-8 environment.
+In this environment, there is a difference between \fIbytes\fR and
+\fIcharacters\fR since in UTF-8, a character may be represented by
+a sequence of more than one byte.  For example, in a UTF-8 environment,
+the string "🙂" contains one character but four bytes.  And the string
+"één" contains three characters but five bytes.
+.PP
+\fBRemind\fR has a set of functions
+that work on \fIbytes\fR, namely \fBindex\fR, \fBstrlen\fR and \fBsubstr\fR.
+These are not safe to use on multi-byte strings; instead use
+\fBmbindex\fR, \fBmbstrlen\fR and \fBmbsubstr\fR.  If you know \fIfor sure\fR
+that a string contains only single-byte characters, then the byte-oriented
+versions may be used and are faster than the multi-byte versions.
+.PP
+Some ancient or embedded systems may lack the C library functions needed
+to deal with multi-byte strings.  In that case, the \fBmb\fIxxx\fR functions
+all return an error.
+.RE
 .TP
 .B TIME
 The \fBTIME\fR data type is used for two different purposes:  To represent
@@ -4031,14 +4051,23 @@ compatible with previous versions of \fBRemind\fR.
 .TP
 .B index(s_search, s_target [,i_start)
 Returns an \fBINT\fR that is the location of \fItarget\fR in the
-string \fIsearch\fR.  The first character of a string is numbered 1.
-If \fItarget\fR does not exist in \fIsearch\fR, then 0 is returned.
+string \fIsearch\fR.  Note that \fBindex\fR uses \fIbyte\fR positions,
+not character positions, so should not be used on non-ASCII strings.  Use
+\fBmbindex\fR for non-ASCII strings.
+.IP
+The first byte of a string is numbered 1.  If \fItarget\fR does not
+exist in \fIsearch\fR, then 0 is returned.
 .RS
 .PP
 The optional parameter \fIstart\fR specifies the position in
 \fIsearch\fR at which to start looking for \fItarget\fR.
 .RE
 .TP
+.B mbindex(s_search, s_target [,i_start])
+Similar to \fBindex()\fR but returns the \fIcharacter\fR position rather
+than the \fIbyte\fR position.  Also, \fIstart\fR is interpreted as a
+1-based character index rather than a byte index.
+.TP
 .B isany(arg1 [,arg2, ..., argN]);
 Returns 1 if the first argument \fIarg1\fR is equal to any of the
 subsequent arguments \fIarg2\fR through \fIargN\fR; returns 0 otherwise.
@@ -4642,17 +4671,27 @@ output is not going to a TTY.
 .RE
 .TP
 .B strlen(s_str)
-Returns the length of \fIstr\fR.  If the length of \fIstr\fR is too large
-to represent as an integer, emits a "Number too high" error.  Note that
-\fBstrlen\fR returns the number of \fIbytes\fR in the string, not the
-number of \fIcharacters\fR.  These numbers are the same for ASCII strings,
-but may be different for UTF-8 strings.
+Returns the length of \fIstr\fR in bytes.  If the length of \fIstr\fR
+is too large to represent as an integer, emits a "Number too high"
+error.  Note that \fBstrlen\fR returns the number of \fIbytes\fR in
+the string, not the number of \fIcharacters\fR.  These numbers are the
+same for ASCII strings, but may be different for UTF-8 strings.
+.TP
+.B mbstrlen(s_str)
+Similar to \fBstrlen\fR, but returns the length of the string in
+\fIcharacters\fR rather than \fIbytes\fR and is thus safe for use
+on multi-byte strings.
 .TP
 .B substr(s_str, i_start [,i_end])
-Returns a \fBSTRING\fR consisting of all characters in \fIstr\fR from
-\fIstart\fR up to and including \fIend\fR.  Characters are numbered
+Returns a \fBSTRING\fR consisting of all bytes in \fIstr\fR from
+\fIstart\fR up to and including \fIend\fR.  Bytes are numbered
 from 1.  If \fIend\fR is not supplied, then it defaults to the length
-of \fIstr\fR.
+of \fIstr\fR.  Because \fBsubstr\fR uses \fIbyte\fR indexes rather than
+\fIcharacter\fR indexes, it should not be used on multi-byte strings.
+.TP
+.B mbsubstr(s_str, i_start [,i_end])
+Similar to \fBsubstr\fR but uses \fIcharacter\fR indexes rather than
+\fIbyte\fR indexes, and is thus safe for use on multi-byte strings.
 .TP
 .B sunrise([dq_date])
 Returns a \fBTIME\fR indicating the time of sunrise on the specified
diff --git a/src/err.h b/src/err.h
index 861b4e64..48b3b691 100644
--- a/src/err.h
+++ b/src/err.h
@@ -134,6 +134,8 @@
 #define E_MAX_OVERDUE_WITHOUT_TODO 110
 #define E_TZ_SPECIFIED_TWICE 111
 #define E_TZ_NO_AT          112
+#define E_NO_MB             113
+#define E_BAD_MB_SEQ        114
 
 #ifdef MK_GLOBALS
 #undef EXTERN
@@ -265,6 +267,8 @@ EXTERN char *ErrMsg[]
 /* E_MAX_OVERDUE_WITHOUT_TODO */ "MAX-OVERDUE specified without TODO",
 /* E_TZ_SPECIFIED_TWICE */ "TZ specified twice",
 /* E_TZ_NO_AT */           "TZ specified for non-timed reminder",
+/* E_NO_MB */              "C library does not support multibyte characters",
+/* E_BAD_MB_SEQ */         "Invalid multibyte sequence",
 }
 #endif /* MK_GLOBALS */
 ;
diff --git a/src/funcs.c b/src/funcs.c
index 4ccc3cf7..3729b99f 100644
--- a/src/funcs.c
+++ b/src/funcs.c
@@ -126,6 +126,9 @@ static int FLanguage       (func_info *);
 static int FLocalToUTC     (func_info *);
 static int FLower          (func_info *);
 static int FMax            (func_info *);
+static int FMbindex        (func_info *);
+static int FMbstrlen       (func_info *);
+static int FMbsubstr       (func_info *);
 static int FMin            (func_info *);
 static int FMinsfromutc    (func_info *);
 static int FMinute         (func_info *);
@@ -305,6 +308,9 @@ BuiltinFunc Func[] = {
     {   "localtoutc",   1,      1,      1,          FLocalToUTC, NULL },
     {   "lower",        1,      1,      1,          FLower, NULL },
     {   "max",          1,      NO_MAX, 1,          FMax, NULL },
+    {   "mbindex",      2,      3,      1,          FMbindex, NULL },
+    {   "mbstrlen",     1,      1,      1,          FMbstrlen, NULL },
+    {   "mbsubstr",     2,      3,      1,          FMbsubstr, NULL },
     {   "min",          1,      NO_MAX, 1,          FMin, NULL },
     {   "minsfromutc",  0,      2,      0,          FMinsfromutc, NULL },
     {   "minute",       1,      1,      1,          FMinute, NULL },
@@ -476,6 +482,29 @@ static int FStrlen(func_info *info)
     return OK;
 }
 
+/***************************************************************/
+/*                                                             */
+/*  FMbstrlen - string length in wide characters               */
+/*                                                             */
+/***************************************************************/
+static int FMbstrlen(func_info *info)
+{
+#ifdef REM_USE_WCHAR
+    size_t nchars;   /* wide-character count, or (size_t) -1 on failure */
+
+    ASSERT_TYPE(0, STR_TYPE);
+    RetVal.type = INT_TYPE;
+    nchars = mbstowcs(NULL, ARGSTR(0), 0);   /* NULL dest: count only */
+    if (nchars == (size_t) -1) return E_BAD_MB_SEQ;
+    if (nchars > INT_MAX) return E_2HIGH;    /* would not fit in an int */
+    RETVAL = (int) nchars;
+    return OK;
+#else
+    RetVal.type = ERR_TYPE;   /* REM_USE_WCHAR not defined */
+    return E_NO_MB;
+#endif
+}
+
 /***************************************************************/
 /*                                                             */
 /*  FBaseyr - system base year                                 */
@@ -2378,6 +2407,76 @@ static int FSubstr(func_info *info)
     return RetStrVal(t, info);
 }
 
+/***************************************************************/
+/*                                                             */
+/*  FMbsubstr                                                  */
+/*                                                             */
+/*  The mbsubstr function.                                     */
+/*                                                             */
+/***************************************************************/
+static int FMbsubstr(func_info *info)
+{
+#ifdef REM_USE_WCHAR
+    wchar_t *wide;          /* wide-character copy of ARG(0) */
+    wchar_t *cur;           /* scan position inside wide */
+    wchar_t const *first;   /* first wide character of the result */
+    size_t nwide;           /* wide characters in ARG(0) */
+    size_t nbytes;          /* bytes needed for the multibyte result */
+    char *result;
+    int pos;                /* 1-based position that cur refers to */
+
+    if (ARG(0).type != STR_TYPE || ARG(1).type != INT_TYPE) return E_BAD_TYPE;
+    if (Nargs == 3 && ARG(2).type != INT_TYPE) return E_BAD_TYPE;
+
+    /* Make a wide-character copy of ARG(0) that can be scanned and cut */
+    nwide = mbstowcs(NULL, ARGSTR(0), 0);
+    if (nwide == (size_t) -1) {
+        return E_BAD_MB_SEQ;
+    }
+    wide = calloc(nwide+1, sizeof(wchar_t));
+    if (!wide) {
+        return E_NO_MEM;
+    }
+    (void) mbstowcs(wide, ARGSTR(0), nwide+1);
+
+    /* Advance to position ARG(1), stopping early at the end of the string */
+    cur = wide;
+    for (pos = 1; pos < ARGV(1) && *cur; pos++) {
+        cur++;
+    }
+    first = cur;
+
+    /* With an end argument, terminate the copy after position ARG(2) */
+    if (Nargs >= 3) {
+        for (; pos <= ARGV(2) && *cur; pos++) {
+            cur++;
+        }
+        *cur = (wchar_t) 0;
+    }
+
+    /* Convert the selected range back into a multibyte string */
+    nbytes = wcstombs(NULL, first, 0);
+    if (nbytes == (size_t) -1) {
+        free( (void *) wide);
+        return E_BAD_MB_SEQ;
+    }
+    result = malloc(nbytes+1);
+    if (!result) {
+        free( (void *) wide);
+        return E_NO_MEM;
+    }
+    (void) wcstombs(result, first, nbytes+1);
+    /* The wide copy is done with; the malloc'd result goes into RetVal */
+    free( (void *) wide);
+    RetVal.type = STR_TYPE;
+    RetVal.v.str = result;
+    return OK;
+#else
+    RetVal.type = ERR_TYPE;
+    return E_NO_MB;
+#endif
+}
+
 /***************************************************************/
 /*                                                             */
 /*  FIndex                                                     */
@@ -2416,6 +2515,73 @@ static int FIndex(func_info *info)
     return OK;
 }
 
+/***************************************************************/
+/*                                                             */
+/*  FMbindex                                                   */
+/*                                                             */
+/*  The mbindex function: 1-based character position of the    */
+/*  string ARG(1) within the string ARG(0), or 0 if absent.    */
+/*  Optional ARG(2) is the 1-based character position at which */
+/*  the search starts; values below 1 mean "from the start".   */
+/*  Fails with E_BAD_MB_SEQ if either string contains an       */
+/*  invalid multibyte sequence.                                */
+/*                                                             */
+/***************************************************************/
+static int FMbindex(func_info *info)
+{
+#ifdef REM_USE_WCHAR
+    wchar_t *haystack;
+    wchar_t *needle;
+    wchar_t const *s;
+    size_t haylen, needlelen;
+
+    if (ARG(0).type != STR_TYPE || ARG(1).type != STR_TYPE ||
+        (Nargs == 3 && ARG(2).type != INT_TYPE)) return E_BAD_TYPE;
+
+    /* Validate both strings before allocating anything, so that the */
+    /* error return below cannot leak a conversion buffer.           */
+    haylen = mbstowcs(NULL, ARGSTR(0), 0);
+    needlelen = mbstowcs(NULL, ARGSTR(1), 0);
+    if (haylen == (size_t) -1 || needlelen == (size_t) -1) {
+        return E_BAD_MB_SEQ;
+    }
+
+    haystack = calloc(haylen+1, sizeof(wchar_t));
+    needle = calloc(needlelen+1, sizeof(wchar_t));
+    if (!haystack || !needle) {
+        free( (void *) haystack);   /* free(NULL) is a no-op */
+        free( (void *) needle);
+        return E_NO_MEM;
+    }
+    (void) mbstowcs(haystack, ARGSTR(0), haylen+1);
+    (void) mbstowcs(needle, ARGSTR(1), needlelen+1);
+    s = haystack;
+
+/* If 3 args, bump up the start.  The offset is clamped to the range */
+/* 0..haylen so that s never points outside the haystack buffer.     */
+    if (Nargs == 3 && ARGV(2) > 1) {
+        size_t skip = (size_t) ARGV(2) - 1;
+        s += (skip > haylen) ? haylen : skip;
+    }
+
+/* Find the string */
+    RetVal.type = INT_TYPE;
+    s = wcsstr(s, needle);
+    /* 0 means "not found"; otherwise turn the offset into a 1-based index */
+    if (!s) {
+        RETVAL = 0;
+    } else {
+        RETVAL = s - haystack + 1;
+    }
+    free( (void *) haystack);
+    free( (void *) needle);
+    return OK;
+#else
+    RetVal.type = ERR_TYPE;
+    return E_NO_MB;
+#endif
+}
+
 /***************************************************************/
 /*                                                             */
 /*  FIif                                                       */
diff --git a/tests/test-rem b/tests/test-rem
index f285781b..9b9953a8 100644
--- a/tests/test-rem
+++ b/tests/test-rem
@@ -59,7 +59,7 @@ chmod 000 include_dir/04cantread.rem
 TEST_GETENV="foo bar baz" ; export TEST_GETENV
 echo "Test 1" > ../tests/test.out
 echo "" >> ../tests/test.out
-../src/remind --flush -e -dxte ../tests/test.rem 16 feb 1991 12:13 2>&1 | grep -v 'TimetIs64bit' >> ../tests/test.out
+../src/remind --flush -e -dxte ../tests/test.rem 16 feb 1991 12:13 2>&1 | grep -v -a 'TimetIs64bit' >> ../tests/test.out 2>&1
 echo "" >> ../tests/test.out
 echo "Test 2" >> ../tests/test.out
 echo "" >> ../tests/test.out
@@ -618,7 +618,7 @@ rm -f ../tests/once.timestamp
 ../src/remind --flush -q ../tests/dedupe.rem 8 November 2023 >> ../tests/test.out 2>&1
 
 # Remove references to SysInclude, which is build-specific
-grep -F -v '$SysInclude' < ../tests/test.out > ../tests/test.out.1 && mv -f ../tests/test.out.1 ../tests/test.out
+grep -F -v -a '$SysInclude' < ../tests/test.out > ../tests/test.out.1 && mv -f ../tests/test.out.1 ../tests/test.out
 
 # If "man" accepts the --warnings flag, test all the man pages.
 RUNMAN=0
@@ -785,7 +785,7 @@ echo "... and here is stderr" >> ../tests/test.out 2>&1
 
 # Test %: substitution sequence in all the languages
 for i in ../include/lang/??.rem ; do
-    ../src/remind --flush "-ii=\"$i\"" -p - 2025-08-13 <<'EOF' 2>&1 | grep 2025/ >> ../tests/test.out
+    ../src/remind --flush "-ii=\"$i\"" -p - 2025-08-13 <<'EOF' 2>&1 | grep -a 2025/ >> ../tests/test.out
 DO [i]
 REM TODO 2025-08-13 MSG %(LANGID) Task1%:
 REM TODO 2025-08-13 COMPLETE-THROUGH 2025-08-12 MSG %(LANGID) Task2%:
diff --git a/tests/test.cmp b/tests/test.cmp
index fa446b73..71b19e8b 100644
--- a/tests/test.cmp
+++ b/tests/test.cmp
@@ -16592,8 +16592,43 @@ Leaving UserFN c() => 33
 
 DEBUG -xe
 Overridden: subst_colon subst_bang subst_question subst_at subst_hash
+bad => "�"
+mbstrlen("�") => Invalid multibyte sequence
+../tests/test.rem(1734): mbstrlen(): Invalid multibyte sequence
+bad => "�"
+strlen("�") => 1
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+mbstrlen("🙂🙂🙂🙂🙂xyzçççéfoo") => 15
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+strlen("🙂🙂🙂🙂🙂xyzçççéfoo") => 34
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+mbindex("🙂🙂🙂🙂🙂xyzçççéfoo", "ç") => 9
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+index("🙂🙂🙂🙂🙂xyzçççéfoo", "ç") => 24
+bad => "�"
+bad => "�"
+mbindex("�", "�") => Invalid multibyte sequence
+../tests/test.rem(1742): mbindex(): Invalid multibyte sequence
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+mbindex("🙂🙂🙂🙂🙂xyzçççéfoo", "ç", 11) => 11
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+index("🙂🙂🙂🙂🙂xyzçççéfoo", "ç", 25) => 26
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+mbsubstr("🙂🙂🙂🙂🙂xyzçççéfoo", 2) => "🙂🙂🙂🙂xyzçççéfoo"
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+mbsubstr("🙂🙂🙂🙂🙂xyzçççéfoo", 2, 9) => "🙂🙂🙂🙂xyzç"
+bad => "�"
+mbsubstr("�", 1) => Invalid multibyte sequence
+../tests/test.rem(1749): mbsubstr(): Invalid multibyte sequence
+bad => "�"
+mbsubstr("�", 1, 20) => Invalid multibyte sequence
+../tests/test.rem(1750): mbsubstr(): Invalid multibyte sequence
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+substr("🙂🙂🙂🙂🙂xyzçççéfoo", 2) => "���🙂🙂🙂🙂xyzçççéfoo"
+faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
+substr("🙂🙂🙂🙂🙂xyzçççéfoo", 2, 9) => "���🙂�"
 Variable hash table statistics:
-  Entries: 100144; Buckets: 87719; Non-empty Buckets: 66301
+  Entries: 100146; Buckets: 87719; Non-empty Buckets: 66303
   Maxlen: 5; Minlen: 0; Avglen: 1.142; Stddev: 0.878; Avg nonempty len: 1.510
   Growths: 13; Shrinks: 0
 Function hash table statistics:
@@ -24589,6 +24624,9 @@ language
 localtoutc
 lower
 max
+mbindex
+mbstrlen
+mbsubstr
 min
 minsfromutc
 minute
@@ -24993,6 +25031,8 @@ TRANSLATE "MAX-OVERDUE specified twice" ""
 TRANSLATE "MAX-OVERDUE specified without TODO" ""
 TRANSLATE "TZ specified twice" ""
 TRANSLATE "TZ specified for non-timed reminder" ""
+TRANSLATE "C library does not support multibyte characters" ""
+TRANSLATE "Invalid multibyte sequence" ""
 
 # Other Messages
 TRANSLATE "%s function `%s' defined at %s(%s) does not use its argument" ""
diff --git a/tests/test.rem b/tests/test.rem
index 46971278..c6e39e65 100644
--- a/tests/test.rem
+++ b/tests/test.rem
@@ -1725,7 +1725,34 @@ fset subst_hash(a, b, c) "subst_hash"
 
 REM MSG Overridden: %: %! %? %@ %#
 
+# mbstrlen and friends
+DEBUG -xe
+set bad char(255)
+set faces "🙂" * 5 + "xyz" + "çççéfoo"
 
+DEBUG +x
+set a mbstrlen(bad)
+set a strlen(bad)
+
+set a mbstrlen(faces)
+set a strlen(faces)
+
+set a mbindex(faces, "ç")
+set a index(faces, "ç")
+set a mbindex(bad, bad)
+
+set a mbindex(faces, "ç", 11)
+set a index(faces, "ç", 25)
+
+set a mbsubstr(faces, 2)
+set a mbsubstr(faces, 2, 9)
+set a mbsubstr(bad, 1)
+set a mbsubstr(bad, 1, 20)
+
+set a substr(faces, 2)
+set a substr(faces, 2, 9)
+
+DEBUG -x
 # Don't want Remind to queue reminders
 EXIT