From 375576fcc57b77915e703d1103ac61a3cabadba2 Mon Sep 17 00:00:00 2001 From: Dianne Skoll Date: Wed, 10 Sep 2025 13:25:27 -0400 Subject: [PATCH] Add better support for multi-byte character set. The functions mbindex, mbstrlen and mbsubstr are just like index, strlen and substr, but they use and return 1-based character indexes rather than 1-based byte indexes. --- contrib/remind-conf-mode/remind-conf-mode.el | 3 +- man/remind.1.in | 59 +++++-- src/err.h | 4 + src/funcs.c | 166 +++++++++++++++++++ tests/test-rem | 6 +- tests/test.cmp | 42 ++++- tests/test.rem | 27 +++ 7 files changed, 292 insertions(+), 15 deletions(-) diff --git a/contrib/remind-conf-mode/remind-conf-mode.el b/contrib/remind-conf-mode/remind-conf-mode.el index 87b3fb7d..2983b5bd 100644 --- a/contrib/remind-conf-mode/remind-conf-mode.el +++ b/contrib/remind-conf-mode/remind-conf-mode.el @@ -171,7 +171,8 @@ "dusk" "easterdate" "escape" "eval" "evaltrig" "filedate" "filedatetime" "filedir" "filename" "getenv" "hebdate" "hebday" "hebmon" "hebyear" "hour" "htmlescape" "htmlstriptags" "iif" "index" "isany" "isconst" "isdst" - "isleap" "isomitted" "language" "localtoutc" "lower" "max" "min" + "isleap" "isomitted" "language" "localtoutc" "lower" "max" + "mbindex" "mbstrlen" "mbsubstr" "min" "minsfromutc" "minute" "mon" "monnum" "moondate" "moondatetime" "moonphase" "moonrise" "moonrisedir" "moonset" "moonsetdir" "moontime" "multitrig" "ndawn" "ndusk" "nonconst" "nonomitted" "now" "ord" "orthodoxeaster" diff --git a/man/remind.1.in b/man/remind.1.in index fa466380..1f9247fd 100644 --- a/man/remind.1.in +++ b/man/remind.1.in @@ -2540,6 +2540,26 @@ word. The \fBINT\fR data type corresponds to the C "int" type. The \fBSTRING\fR data type consists of strings of characters. It is somewhat comparable to a C character array, but more closely resembles the string type in BASIC. +.RS +.PP +\fBRemind\fR normally expects to be running in a UTF-8 environment. 
+In this environment, there is a difference between \fIbytes\fR and +\fIcharacters\fR since in UTF-8, a character may be represented by +a sequence of more than one byte. For example, in a UTF-8 environment, +the string "🙂" contains one character but four bytes. And the string +"één" contains three characters but five bytes. +.PP +\fBRemind\fR has a set of functions +that work on \fIbytes\fR, namely \fBindex\fR, \fBstrlen\fR and \fBsubstr\fR. +These are not safe to use on multi-byte strings; instead use +\fBmbindex\fR, \fBmbstrlen\fR and \fBmbsubstr\fR. If you know \fIfor sure\fR +that a string contains only single-byte characters, then the byte-oriented +versions may be used and are faster than the multi-byte versions. +.PP +Some ancient or embedded systems may lack the C library functions needed +to deal with multi-byte strings. In that case, the \fBmb\fIxxx\fR functions +all return an error. +.RE .TP .B TIME The \fBTIME\fR data type is used for two different purposes: To represent @@ -4031,14 +4051,23 @@ compatible with previous versions of \fBRemind\fR. .TP .B index(s_search, s_target [,i_start) Returns an \fBINT\fR that is the location of \fItarget\fR in the -string \fIsearch\fR. The first character of a string is numbered 1. -If \fItarget\fR does not exist in \fIsearch\fR, then 0 is returned. +string \fIsearch\fR. Note that \fBindex\fR uses \fIbyte\fR positions, +not character positions, so should not be used on non-ASCII strings. Use +\fBmbindex\fR for non-ASCII strings. +.IP +The first byte of a string is numbered 1. If \fItarget\fR does not +exist in \fIsearch\fR, then 0 is returned. .RS .PP The optional parameter \fIstart\fR specifies the position in \fIsearch\fR at which to start looking for \fItarget\fR. .RE .TP +.B mbindex(s_search, s_target [,i_start]) +Similar to \fBindex()\fR but returns the \fIcharacter\fR position rather +than the \fIbyte\fR position. Also, \fIstart\fR is interpreted as a +1-based character index rather than a byte index. 
+.TP .B isany(arg1 [,arg2, ..., argN]); Returns 1 if the first argument \fIarg1\fR is equal to any of the subsequent arguments \fIarg2\fR through \fIargN\fR; returns 0 otherwise. @@ -4642,17 +4671,27 @@ output is not going to a TTY. .RE .TP .B strlen(s_str) -Returns the length of \fIstr\fR. If the length of \fIstr\fR is too large -to represent as an integer, emits a "Number too high" error. Note that -\fBstrlen\fR returns the number of \fIbytes\fR in the string, not the -number of \fIcharacters\fR. These numbers are the same for ASCII strings, -but may be different for UTF-8 strings. +Returns the length of \fIstr\fR in bytes. If the length of \fIstr\fR +is too large to represent as an integer, emits a "Number too high" +error. Note that \fBstrlen\fR returns the number of \fIbytes\fR in +the string, not the number of \fIcharacters\fR. These numbers are the +same for ASCII strings, but may be different for UTF-8 strings. +.TP +.B mbstrlen(s_str) +Similar to \fBstrlen\fR, but returns the length of the string in +\fIcharacters\fR rather than \fIbytes\fR and is thus safe for use +on multi-byte strings. .TP .B substr(s_str, i_start [,i_end]) -Returns a \fBSTRING\fR consisting of all characters in \fIstr\fR from -\fIstart\fR up to and including \fIend\fR. Characters are numbered +Returns a \fBSTRING\fR consisting of all bytes in \fIstr\fR from +\fIstart\fR up to and including \fIend\fR. Bytes are numbered from 1. If \fIend\fR is not supplied, then it defaults to the length -of \fIstr\fR. +of \fIstr\fR. Because \fBsubstr\fR uses \fIbyte\fR indexes rather than +\fIcharacter\fR indexes, it should not be used on multi-byte strings. +.TP +.B mbsubstr(s_str, i_start [,i_end]) +Similar to \fBsubstr\fR but uses \fIcharacter\fR indexes rather than +\fIbyte\fR indexes, and is thus safe for use on multi-byte strings. 
.TP .B sunrise([dq_date]) Returns a \fBTIME\fR indicating the time of sunrise on the specified diff --git a/src/err.h b/src/err.h index 861b4e64..48b3b691 100644 --- a/src/err.h +++ b/src/err.h @@ -134,6 +134,8 @@ #define E_MAX_OVERDUE_WITHOUT_TODO 110 #define E_TZ_SPECIFIED_TWICE 111 #define E_TZ_NO_AT 112 +#define E_NO_MB 113 +#define E_BAD_MB_SEQ 114 #ifdef MK_GLOBALS #undef EXTERN @@ -265,6 +267,8 @@ EXTERN char *ErrMsg[] /* E_MAX_OVERDUE_WITHOUT_TODO */ "MAX-OVERDUE specified without TODO", /* E_TZ_SPECIFIED_TWICE */ "TZ specified twice", /* E_TZ_NO_AT */ "TZ specified for non-timed reminder", +/* E_NO_MB */ "C library does not support multibyte characters", +/* E_BAD_MB_SEQ */ "Invalid multibyte sequence", } #endif /* MK_GLOBALS */ ; diff --git a/src/funcs.c b/src/funcs.c index 4ccc3cf7..3729b99f 100644 --- a/src/funcs.c +++ b/src/funcs.c @@ -126,6 +126,9 @@ static int FLanguage (func_info *); static int FLocalToUTC (func_info *); static int FLower (func_info *); static int FMax (func_info *); +static int FMbindex (func_info *); +static int FMbstrlen (func_info *); +static int FMbsubstr (func_info *); static int FMin (func_info *); static int FMinsfromutc (func_info *); static int FMinute (func_info *); @@ -305,6 +308,9 @@ BuiltinFunc Func[] = { { "localtoutc", 1, 1, 1, FLocalToUTC, NULL }, { "lower", 1, 1, 1, FLower, NULL }, { "max", 1, NO_MAX, 1, FMax, NULL }, + { "mbindex", 2, 3, 1, FMbindex, NULL }, + { "mbstrlen", 1, 1, 1, FMbstrlen, NULL }, + { "mbsubstr", 2, 3, 1, FMbsubstr, NULL }, { "min", 1, NO_MAX, 1, FMin, NULL }, { "minsfromutc", 0, 2, 0, FMinsfromutc, NULL }, { "minute", 1, 1, 1, FMinute, NULL }, @@ -476,6 +482,29 @@ static int FStrlen(func_info *info) return OK; } +/***************************************************************/ +/* */ +/* FMbstrlen - string length in wide characters */ +/* */ +/***************************************************************/ +static int FMbstrlen(func_info *info) +{ +#ifdef REM_USE_WCHAR + ASSERT_TYPE(0, 
STR_TYPE);
+    RetVal.type = INT_TYPE;
+    size_t l = mbstowcs(NULL, ARGSTR(0), 0);
+    if (l == (size_t) -1) {
+        return E_BAD_MB_SEQ;
+    }
+    if (l > INT_MAX) return E_2HIGH;
+    RETVAL = (int) l;
+    return OK;
+#else
+    RetVal.type = ERR_TYPE;
+    return E_NO_MB;
+#endif
+}
+
 /***************************************************************/
 /*                                                             */
 /*  FBaseyr - system base year                                 */
@@ -2378,6 +2407,76 @@ static int FSubstr(func_info *info)
     return RetStrVal(t, info);
 }
 
+/***************************************************************/
+/*                                                             */
+/*  FMbsubstr                                                  */
+/*                                                             */
+/*  The mbsubstr function.                                     */
+/*                                                             */
+/***************************************************************/
+static int FMbsubstr(func_info *info)
+{
+#ifdef REM_USE_WCHAR
+    wchar_t *str, *s;
+    wchar_t const *t;
+    size_t nchars, len;
+    char *converted;
+    int start, end;
+
+    if (ARG(0).type != STR_TYPE || ARG(1).type != INT_TYPE) return E_BAD_TYPE;
+    if (Nargs == 3 && ARG(2).type != INT_TYPE) return E_BAD_TYPE;
+
+    /* Work on a wide-character copy so that indexes count characters */
+    nchars = mbstowcs(NULL, ARGSTR(0), 0);
+    if (nchars == (size_t) -1) {
+        return E_BAD_MB_SEQ;
+    }
+    str = calloc(nchars+1, sizeof(wchar_t));
+    if (!str) {
+        return E_NO_MEM;
+    }
+    (void) mbstowcs(str, ARGSTR(0), nchars+1);
+    /* Advance s to 1-based character ARGV(1), stopping at end of string */
+    s = str;
+    start = 1;
+    while (start < ARGV(1)) {
+        if (!*s) break;
+        s++;
+        start++;
+    }
+    t = s;
+    /* Given an end index, terminate the copy just after character ARGV(2) */
+    if (Nargs >= 3) {
+        end = start;
+        while (end <= ARGV(2)) {
+            if (!*s) break;
+            s++;
+            end++;
+        }
+        *s = (wchar_t) 0;
+    }
+    /* Convert the selected characters back to a multibyte string */
+    len = wcstombs(NULL, t, 0);
+    if (len == (size_t) -1) {
+        free( (void *) str);
+        return E_BAD_MB_SEQ;
+    }
+    converted = malloc(len+1);
+    if (!converted) {
+        free( (void *) str);
+        return E_NO_MEM;
+    }
+    (void) wcstombs(converted, t, len+1);
+    RetVal.type = STR_TYPE;
+    RetVal.v.str = converted;
+    free( (void *) str);
+    return OK;
+#else
+    RetVal.type = ERR_TYPE;
+    return E_NO_MB;
+#endif
+}
+
 /***************************************************************/
 /*                                                             */
 /*  FIndex                                                     */
@@ -2416,6 +2515,73 @@ static int FIndex(func_info *info)
     return OK;
 }
 
+/***************************************************************/
+/*                                                             */
+/*  FMbindex                                                   */
+/*                                                             */
+/*  The mbindex function: the 1-based character position of    */
+/*  one string within another, or 0 if it does not occur.      */
+/*                                                             */
+/*  Args: search string, target string, optional 1-based       */
+/*  character index at which to start searching.               */
+/*                                                             */
+/***************************************************************/
+static int FMbindex(func_info *info)
+{
+#ifdef REM_USE_WCHAR
+    wchar_t *haystack, *needle;
+    wchar_t const *s;
+    size_t haylen, needlelen;
+
+    if (ARG(0).type != STR_TYPE || ARG(1).type != STR_TYPE ||
+        (Nargs == 3 && ARG(2).type != INT_TYPE)) return E_BAD_TYPE;
+
+    /* Validate and measure both strings before allocating anything,
+       so an invalid multibyte sequence cannot leak a buffer. */
+    haylen = mbstowcs(NULL, ARGSTR(0), 0);
+    needlelen = mbstowcs(NULL, ARGSTR(1), 0);
+    if (haylen == (size_t) -1 || needlelen == (size_t) -1) {
+        return E_BAD_MB_SEQ;
+    }
+
+    haystack = calloc(haylen+1, sizeof(wchar_t));
+    if (!haystack) {
+        return E_NO_MEM;
+    }
+    needle = calloc(needlelen+1, sizeof(wchar_t));
+    if (!needle) {
+        free( (void *) haystack);
+        return E_NO_MEM;
+    }
+    (void) mbstowcs(haystack, ARGSTR(0), haylen+1);
+    (void) mbstowcs(needle, ARGSTR(1), needlelen+1);
+    s = haystack;
+
+    /* If 3 args, bump up the start.  Values below 1 are treated as 1
+       and values past the end are clamped to the terminator, so s
+       never points outside the haystack buffer. */
+    if (Nargs == 3 && ARGV(2) > 1) {
+        if ((size_t) ARGV(2) > haylen) {
+            s += haylen;
+        } else {
+            s += ARGV(2) - 1;
+        }
+    }
+
+    /* Find the string; report its 1-based character position, or 0 */
+    RetVal.type = INT_TYPE;
+    s = wcsstr(s, needle);
+    RETVAL = s ? (int) (s - haystack) + 1 : 0;
+    free( (void *) haystack);
+    free( (void *) needle);
+    return OK;
+#else
+    /* Built without REM_USE_WCHAR: mbindex is unavailable */
+    RetVal.type = ERR_TYPE;
+    return E_NO_MB;
+#endif
+}
+
 /***************************************************************/
 /*                                                             */
 /*  FIif                                                       */
diff --git a/tests/test-rem b/tests/test-rem
index f285781b..9b9953a8 100644
--- a/tests/test-rem
+++ b/tests/test-rem
@@ -59,7 +59,7 @@ chmod 000 include_dir/04cantread.rem
 TEST_GETENV="foo bar baz" ; export TEST_GETENV
 echo "Test 1" > ../tests/test.out
 echo "" >> ../tests/test.out
-../src/remind --flush -e -dxte ../tests/test.rem 16 feb 1991 12:13 2>&1 | grep -v 'TimetIs64bit' >>
../tests/test.out +../src/remind --flush -e -dxte ../tests/test.rem 16 feb 1991 12:13 2>&1 | grep -v -a 'TimetIs64bit' >> ../tests/test.out 2>&1 echo "" >> ../tests/test.out echo "Test 2" >> ../tests/test.out echo "" >> ../tests/test.out @@ -618,7 +618,7 @@ rm -f ../tests/once.timestamp ../src/remind --flush -q ../tests/dedupe.rem 8 November 2023 >> ../tests/test.out 2>&1 # Remove references to SysInclude, which is build-specific -grep -F -v '$SysInclude' < ../tests/test.out > ../tests/test.out.1 && mv -f ../tests/test.out.1 ../tests/test.out +grep -F -v -a '$SysInclude' < ../tests/test.out > ../tests/test.out.1 && mv -f ../tests/test.out.1 ../tests/test.out # If "man" accepts the --warnings flag, test all the man pages. RUNMAN=0 @@ -785,7 +785,7 @@ echo "... and here is stderr" >> ../tests/test.out 2>&1 # Test %: substitution sequence in all the languages for i in ../include/lang/??.rem ; do - ../src/remind --flush "-ii=\"$i\"" -p - 2025-08-13 <<'EOF' 2>&1 | grep 2025/ >> ../tests/test.out + ../src/remind --flush "-ii=\"$i\"" -p - 2025-08-13 <<'EOF' 2>&1 | grep -a 2025/ >> ../tests/test.out DO [i] REM TODO 2025-08-13 MSG %(LANGID) Task1%: REM TODO 2025-08-13 COMPLETE-THROUGH 2025-08-12 MSG %(LANGID) Task2%: diff --git a/tests/test.cmp b/tests/test.cmp index fa446b73..71b19e8b 100644 --- a/tests/test.cmp +++ b/tests/test.cmp @@ -16592,8 +16592,43 @@ Leaving UserFN c() => 33 DEBUG -xe Overridden: subst_colon subst_bang subst_question subst_at subst_hash +bad => "ÿ" +mbstrlen("ÿ") => Invalid multibyte sequence +../tests/test.rem(1734): mbstrlen(): Invalid multibyte sequence +bad => "ÿ" +strlen("ÿ") => 1 +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +mbstrlen("🙂🙂🙂🙂🙂xyzçççéfoo") => 15 +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +strlen("🙂🙂🙂🙂🙂xyzçççéfoo") => 34 +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +mbindex("🙂🙂🙂🙂🙂xyzçççéfoo", "ç") => 9 +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +index("🙂🙂🙂🙂🙂xyzçççéfoo", "ç") => 24 +bad => "ÿ" +bad => "ÿ" +mbindex("ÿ", "ÿ") => Invalid multibyte sequence +../tests/test.rem(1742): mbindex(): 
Invalid multibyte sequence +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +mbindex("🙂🙂🙂🙂🙂xyzçççéfoo", "ç", 11) => 11 +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +index("🙂🙂🙂🙂🙂xyzçççéfoo", "ç", 25) => 26 +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +mbsubstr("🙂🙂🙂🙂🙂xyzçççéfoo", 2) => "🙂🙂🙂🙂xyzçççéfoo" +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +mbsubstr("🙂🙂🙂🙂🙂xyzçççéfoo", 2, 9) => "🙂🙂🙂🙂xyzç" +bad => "ÿ" +mbsubstr("ÿ", 1) => Invalid multibyte sequence +../tests/test.rem(1749): mbsubstr(): Invalid multibyte sequence +bad => "ÿ" +mbsubstr("ÿ", 1, 20) => Invalid multibyte sequence +../tests/test.rem(1750): mbsubstr(): Invalid multibyte sequence +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +substr("🙂🙂🙂🙂🙂xyzçççéfoo", 2) => "Ÿ™‚🙂🙂🙂🙂xyzçççéfoo" +faces => "🙂🙂🙂🙂🙂xyzçççéfoo" +substr("🙂🙂🙂🙂🙂xyzçççéfoo", 2, 9) => "Ÿ™‚🙂ð" Variable hash table statistics: - Entries: 100144; Buckets: 87719; Non-empty Buckets: 66301 + Entries: 100146; Buckets: 87719; Non-empty Buckets: 66303 Maxlen: 5; Minlen: 0; Avglen: 1.142; Stddev: 0.878; Avg nonempty len: 1.510 Growths: 13; Shrinks: 0 Function hash table statistics: @@ -24589,6 +24624,9 @@ language localtoutc lower max +mbindex +mbstrlen +mbsubstr min minsfromutc minute @@ -24993,6 +25031,8 @@ TRANSLATE "MAX-OVERDUE specified twice" "" TRANSLATE "MAX-OVERDUE specified without TODO" "" TRANSLATE "TZ specified twice" "" TRANSLATE "TZ specified for non-timed reminder" "" +TRANSLATE "C library does not support multibyte characters" "" +TRANSLATE "Invalid multibyte sequence" "" # Other Messages TRANSLATE "%s function `%s' defined at %s(%s) does not use its argument" "" diff --git a/tests/test.rem b/tests/test.rem index 46971278..c6e39e65 100644 --- a/tests/test.rem +++ b/tests/test.rem @@ -1725,7 +1725,34 @@ fset subst_hash(a, b, c) "subst_hash" REM MSG Overridden: %: %! %? 
%@ %# +# mbstrlen and friends +DEBUG -xe +set bad char(255) +set faces "🙂" * 5 + "xyz" + "çççéfoo" +DEBUG +x +set a mbstrlen(bad) +set a strlen(bad) + +set a mbstrlen(faces) +set a strlen(faces) + +set a mbindex(faces, "ç") +set a index(faces, "ç") +set a mbindex(bad, bad) + +set a mbindex(faces, "ç", 11) +set a index(faces, "ç", 25) + +set a mbsubstr(faces, 2) +set a mbsubstr(faces, 2, 9) +set a mbsubstr(bad, 1) +set a mbsubstr(bad, 1, 20) + +set a substr(faces, 2) +set a substr(faces, 2, 9) + +DEBUG -x # Don't want Remind to queue reminders EXIT