Add better support for multi-byte character set.

The functions mbindex, mbstrlen and mbsubstr are just like
index, strlen and substr, but they use and return 1-based
character indexes rather than 1-based byte indexes.
This commit is contained in:
Dianne Skoll
2025-09-10 13:25:27 -04:00
parent 2c9087aa79
commit 375576fcc5
7 changed files with 292 additions and 15 deletions
+2 -1
View File
@@ -171,7 +171,8 @@
"dusk" "easterdate" "escape" "eval" "evaltrig" "filedate" "filedatetime"
"filedir" "filename" "getenv" "hebdate" "hebday" "hebmon" "hebyear"
"hour" "htmlescape" "htmlstriptags" "iif" "index" "isany" "isconst" "isdst"
"isleap" "isomitted" "language" "localtoutc" "lower" "max" "min"
"isleap" "isomitted" "language" "localtoutc" "lower" "max"
"mbindex" "mbstrlen" "mbsubstr" "min"
"minsfromutc" "minute" "mon" "monnum" "moondate" "moondatetime"
"moonphase" "moonrise" "moonrisedir" "moonset" "moonsetdir" "moontime"
"multitrig" "ndawn" "ndusk" "nonconst" "nonomitted" "now" "ord" "orthodoxeaster"
+49 -10
View File
@@ -2540,6 +2540,26 @@ word. The \fBINT\fR data type corresponds to the C "int" type.
The \fBSTRING\fR data type consists of strings of characters. It is
somewhat comparable to a C character array, but more closely resembles
the string type in BASIC.
.RS
.PP
\fBRemind\fR normally expects to be running in a UTF-8 environment.
In this environment, there is a difference between \fIbytes\fR and
\fIcharacters\fR since in UTF-8, a character may be represented by
a sequence of more than one byte. For example, in a UTF-8 environment,
the string "🙂" contains one character but four bytes. And the string
"één" contains three characters but five bytes.
.PP
\fBRemind\fR has a set of functions
that work on \fIbytes\fR, namely \fBindex\fR, \fBstrlen\fR and \fBsubstr\fR.
These are not safe to use on multi-byte strings; instead use
\fBmbindex\fR, \fBmbstrlen\fR and \fBmbsubstr\fR. If you know \fIfor sure\fR
that a string contains only single-byte characters, then the byte-oriented
versions may be used and are faster than the multi-byte versions.
.PP
Some ancient or embedded systems may lack the C library functions needed
to deal with multi-byte strings. In that case, the \fBmb\fIxxx\fR functions
all return an error.
.RE
.TP
.B TIME
The \fBTIME\fR data type is used for two different purposes: To represent
@@ -4031,14 +4051,23 @@ compatible with previous versions of \fBRemind\fR.
.TP
.B index(s_search, s_target [,i_start])
Returns an \fBINT\fR that is the location of \fItarget\fR in the
string \fIsearch\fR. The first character of a string is numbered 1.
If \fItarget\fR does not exist in \fIsearch\fR, then 0 is returned.
string \fIsearch\fR. Note that \fBindex\fR uses \fIbyte\fR positions,
not character positions, so should not be used on non-ASCII strings. Use
\fBmbindex\fR for non-ASCII strings.
.RS
.PP
The first byte of a string is numbered 1. If \fItarget\fR does not
exist in \fIsearch\fR, then 0 is returned.
.PP
The optional parameter \fIstart\fR specifies the position in
\fIsearch\fR at which to start looking for \fItarget\fR.
.RE
.TP
.B mbindex(s_search, s_target [,i_start])
Similar to \fBindex()\fR but returns the \fIcharacter\fR position rather
than the \fIbyte\fR position. Also, \fIstart\fR is interpreted as a
1-based character index rather than a byte index.
.TP
.B isany(arg1 [,arg2, ..., argN]);
Returns 1 if the first argument \fIarg1\fR is equal to any of the
subsequent arguments \fIarg2\fR through \fIargN\fR; returns 0 otherwise.
@@ -4642,17 +4671,27 @@ output is not going to a TTY.
.RE
.TP
.B strlen(s_str)
Returns the length of \fIstr\fR. If the length of \fIstr\fR is too large
to represent as an integer, emits a "Number too high" error. Note that
\fBstrlen\fR returns the number of \fIbytes\fR in the string, not the
number of \fIcharacters\fR. These numbers are the same for ASCII strings,
but may be different for UTF-8 strings.
Returns the length of \fIstr\fR in bytes. If the length of \fIstr\fR
is too large to represent as an integer, emits a "Number too high"
error. Note that \fBstrlen\fR returns the number of \fIbytes\fR in
the string, not the number of \fIcharacters\fR. These numbers are the
same for ASCII strings, but may be different for UTF-8 strings.
.TP
.B mbstrlen(s_str)
Similar to \fBstrlen\fR, but returns the length of the string in
\fIcharacters\fR rather than \fIbytes\fR and is thus safe for use
on multi-byte strings.
.TP
.B substr(s_str, i_start [,i_end])
Returns a \fBSTRING\fR consisting of all characters in \fIstr\fR from
\fIstart\fR up to and including \fIend\fR. Characters are numbered
Returns a \fBSTRING\fR consisting of all bytes in \fIstr\fR from
\fIstart\fR up to and including \fIend\fR. Bytes are numbered
from 1. If \fIend\fR is not supplied, then it defaults to the length
of \fIstr\fR.
of \fIstr\fR. Because \fBsubstr\fR uses \fIbyte\fR indexes rather than
\fIcharacter\fR indexes, it should not be used on multi-byte strings.
.TP
.B mbsubstr(s_str, i_start [,i_end])
Similar to \fBsubstr\fR but uses \fIcharacter\fR indexes rather than
\fIbyte\fR indexes, and is thus safe for use on multi-byte strings.
.TP
.B sunrise([dq_date])
Returns a \fBTIME\fR indicating the time of sunrise on the specified
+4
View File
@@ -134,6 +134,8 @@
#define E_MAX_OVERDUE_WITHOUT_TODO 110
#define E_TZ_SPECIFIED_TWICE 111
#define E_TZ_NO_AT 112
#define E_NO_MB 113
#define E_BAD_MB_SEQ 114
#ifdef MK_GLOBALS
#undef EXTERN
@@ -265,6 +267,8 @@ EXTERN char *ErrMsg[]
/* E_MAX_OVERDUE_WITHOUT_TODO */ "MAX-OVERDUE specified without TODO",
/* E_TZ_SPECIFIED_TWICE */ "TZ specified twice",
/* E_TZ_NO_AT */ "TZ specified for non-timed reminder",
/* E_NO_MB */ "C library does not support multibyte characters",
/* E_BAD_MB_SEQ */ "Invalid multibyte sequence",
}
#endif /* MK_GLOBALS */
;
+166
View File
@@ -126,6 +126,9 @@ static int FLanguage (func_info *);
static int FLocalToUTC (func_info *);
static int FLower (func_info *);
static int FMax (func_info *);
static int FMbindex (func_info *);
static int FMbstrlen (func_info *);
static int FMbsubstr (func_info *);
static int FMin (func_info *);
static int FMinsfromutc (func_info *);
static int FMinute (func_info *);
@@ -305,6 +308,9 @@ BuiltinFunc Func[] = {
{ "localtoutc", 1, 1, 1, FLocalToUTC, NULL },
{ "lower", 1, 1, 1, FLower, NULL },
{ "max", 1, NO_MAX, 1, FMax, NULL },
{ "mbindex", 2, 3, 1, FMbindex, NULL },
{ "mbstrlen", 1, 1, 1, FMbstrlen, NULL },
{ "mbsubstr", 2, 3, 1, FMbsubstr, NULL },
{ "min", 1, NO_MAX, 1, FMin, NULL },
{ "minsfromutc", 0, 2, 0, FMinsfromutc, NULL },
{ "minute", 1, 1, 1, FMinute, NULL },
@@ -476,6 +482,29 @@ static int FStrlen(func_info *info)
return OK;
}
/***************************************************************/
/* */
/* FMbstrlen - string length in wide characters */
/* */
/***************************************************************/
static int FMbstrlen(func_info *info)
{
#ifdef REM_USE_WCHAR
    /* Number of wide characters the argument decodes to */
    size_t nchars;

    ASSERT_TYPE(0, STR_TYPE);
    RetVal.type = INT_TYPE;

    /* With a NULL destination, mbstowcs only counts characters;
       (size_t) -1 signals an undecodable byte sequence. */
    nchars = mbstowcs(NULL, ARGSTR(0), 0);
    if (nchars == (size_t) -1) return E_BAD_MB_SEQ;

    /* The result is handed back as an int, so it must fit in one */
    if (nchars > INT_MAX) {
        return E_2HIGH;
    }

    RETVAL = (int) nchars;
    return OK;
#else
    /* Built without wide-character support: always an error */
    RetVal.type = ERR_TYPE;
    return E_NO_MB;
#endif
}
/***************************************************************/
/* */
/* FBaseyr - system base year */
@@ -2378,6 +2407,76 @@ static int FSubstr(func_info *info)
return RetStrVal(t, info);
}
/***************************************************************/
/* */
/* FMbsubstr */
/* */
/* The mbsubstr function. */
/* */
/***************************************************************/
static int FMbsubstr(func_info *info)
{
#ifdef REM_USE_WCHAR
    wchar_t *wide;          /* Wide-character copy of the input string    */
    wchar_t *cursor;        /* Walks through the wide copy                */
    wchar_t const *first;   /* First character of the requested substring */
    char *result;           /* Multibyte encoding of the substring        */
    size_t nchars;          /* Length of the input in wide characters     */
    size_t nbytes;          /* Length of the substring in bytes           */
    int pos;                /* 1-based character position of cursor       */
    int status = OK;

    if (ARG(0).type != STR_TYPE || ARG(1).type != INT_TYPE) return E_BAD_TYPE;
    if (Nargs == 3 && ARG(2).type != INT_TYPE) return E_BAD_TYPE;

    /* Count the characters first so the wide buffer can be sized */
    nchars = mbstowcs(NULL, ARGSTR(0), 0);
    if (nchars == (size_t) -1) return E_BAD_MB_SEQ;

    wide = calloc(nchars+1, sizeof(wchar_t));
    if (!wide) return E_NO_MEM;
    (void) mbstowcs(wide, ARGSTR(0), nchars+1);

    /* Advance to the start position, stopping early at end of string */
    cursor = wide;
    for (pos = 1; pos < ARGV(1) && *cursor; pos++) {
        cursor++;
    }
    first = cursor;

    /* With an end position, keep walking up to and including that
       character and cut the wide string off there */
    if (Nargs >= 3) {
        for (; pos <= ARGV(2) && *cursor; pos++) {
            cursor++;
        }
        *cursor = (wchar_t) 0;
    }

    /* Convert the selected range back to a multibyte string */
    nbytes = wcstombs(NULL, first, 0);
    if (nbytes == (size_t) -1) {
        status = E_BAD_MB_SEQ;
    } else {
        result = malloc(nbytes+1);
        if (!result) {
            status = E_NO_MEM;
        } else {
            (void) wcstombs(result, first, nbytes+1);
            RetVal.type = STR_TYPE;
            RetVal.v.str = result;
        }
    }

    free( (void *) wide);
    return status;
#else
    /* Built without wide-character support: always an error */
    RetVal.type = ERR_TYPE;
    return E_NO_MB;
#endif
}
/***************************************************************/
/* */
/* FIndex */
@@ -2416,6 +2515,73 @@ static int FIndex(func_info *info)
return OK;
}
/***************************************************************/
/* */
/* FMbindex */
/* */
/* The wide-char index of one string embedded in another. */
/* */
/***************************************************************/
/* mbindex(search, target [, start]): store in RETVAL the 1-based     */
/* character position of the first occurrence of target in search,    */
/* or 0 if it does not occur.  When a start position is supplied,     */
/* searching begins at that 1-based character; values below 1 are     */
/* treated as 1 and values past the end as the end of the string.     */
/* Returns OK, E_BAD_TYPE, E_BAD_MB_SEQ or E_NO_MEM; without          */
/* REM_USE_WCHAR it always returns E_NO_MB.                           */
static int FMbindex(func_info *info)
{
#ifdef REM_USE_WCHAR
    wchar_t *haystack = NULL;
    wchar_t *needle = NULL;
    wchar_t const *s;
    size_t haylen, needlelen;
    int r = OK;

    if (ARG(0).type != STR_TYPE || ARG(1).type != STR_TYPE ||
        (Nargs == 3 && ARG(2).type != INT_TYPE)) return E_BAD_TYPE;

    /* Convert the string being searched to wide characters */
    haylen = mbstowcs(NULL, ARGSTR(0), 0);
    if (haylen == (size_t) -1) {
        return E_BAD_MB_SEQ;
    }
    haystack = calloc(haylen+1, sizeof(wchar_t));
    if (!haystack) {
        return E_NO_MEM;
    }
    (void) mbstowcs(haystack, ARGSTR(0), haylen+1);

    /* Convert the target; from here on every exit must release
       haystack, so failures go through the common cleanup below */
    needlelen = mbstowcs(NULL, ARGSTR(1), 0);
    if (needlelen == (size_t) -1) {
        r = E_BAD_MB_SEQ;
        goto done;
    }
    needle = calloc(needlelen+1, sizeof(wchar_t));
    if (!needle) {
        r = E_NO_MEM;
        goto done;
    }
    (void) mbstowcs(needle, ARGSTR(1), needlelen+1);

    s = haystack;

    /* If 3 args, bump up the start.  Only move forward (a start of
       1 or less means "from the beginning") and never move past the
       terminating NUL, so s always stays inside the buffer. */
    if (Nargs == 3 && ARGV(2) > 1) {
        size_t skip = (size_t) ARGV(2) - 1;
        if (skip > haylen) {
            skip = haylen;
        }
        s += skip;
    }

    /* Find the string */
    RetVal.type = INT_TYPE;
    s = wcsstr(s, needle);
    if (s) {
        RETVAL = (int) (s - haystack) + 1;
    } else {
        RETVAL = 0;
    }

done:
    /* free(NULL) is a no-op, so this is safe on every path */
    free( (void *) needle);
    free( (void *) haystack);
    return r;
#else
    /* Built without wide-character support: always an error */
    RetVal.type = ERR_TYPE;
    return E_NO_MB;
#endif
}
/***************************************************************/
/* */
/* FIif */
+3 -3
View File
@@ -59,7 +59,7 @@ chmod 000 include_dir/04cantread.rem
TEST_GETENV="foo bar baz" ; export TEST_GETENV
echo "Test 1" > ../tests/test.out
echo "" >> ../tests/test.out
../src/remind --flush -e -dxte ../tests/test.rem 16 feb 1991 12:13 2>&1 | grep -v 'TimetIs64bit' >> ../tests/test.out
../src/remind --flush -e -dxte ../tests/test.rem 16 feb 1991 12:13 2>&1 | grep -v -a 'TimetIs64bit' >> ../tests/test.out 2>&1
echo "" >> ../tests/test.out
echo "Test 2" >> ../tests/test.out
echo "" >> ../tests/test.out
@@ -618,7 +618,7 @@ rm -f ../tests/once.timestamp
../src/remind --flush -q ../tests/dedupe.rem 8 November 2023 >> ../tests/test.out 2>&1
# Remove references to SysInclude, which is build-specific
grep -F -v '$SysInclude' < ../tests/test.out > ../tests/test.out.1 && mv -f ../tests/test.out.1 ../tests/test.out
grep -F -v -a '$SysInclude' < ../tests/test.out > ../tests/test.out.1 && mv -f ../tests/test.out.1 ../tests/test.out
# If "man" accepts the --warnings flag, test all the man pages.
RUNMAN=0
@@ -785,7 +785,7 @@ echo "... and here is stderr" >> ../tests/test.out 2>&1
# Test %: substitution sequence in all the languages
for i in ../include/lang/??.rem ; do
../src/remind --flush "-ii=\"$i\"" -p - 2025-08-13 <<'EOF' 2>&1 | grep 2025/ >> ../tests/test.out
../src/remind --flush "-ii=\"$i\"" -p - 2025-08-13 <<'EOF' 2>&1 | grep -a 2025/ >> ../tests/test.out
DO [i]
REM TODO 2025-08-13 MSG %(LANGID) Task1%:
REM TODO 2025-08-13 COMPLETE-THROUGH 2025-08-12 MSG %(LANGID) Task2%:
+41 -1
View File
@@ -16592,8 +16592,43 @@ Leaving UserFN c() => 33
DEBUG -xe
Overridden: subst_colon subst_bang subst_question subst_at subst_hash
bad => "ÿ"
mbstrlen("ÿ") => Invalid multibyte sequence
../tests/test.rem(1734): mbstrlen(): Invalid multibyte sequence
bad => "ÿ"
strlen("ÿ") => 1
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
mbstrlen("🙂🙂🙂🙂🙂xyzçççéfoo") => 15
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
strlen("🙂🙂🙂🙂🙂xyzçççéfoo") => 34
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
mbindex("🙂🙂🙂🙂🙂xyzçççéfoo", "ç") => 9
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
index("🙂🙂🙂🙂🙂xyzçççéfoo", "ç") => 24
bad => "ÿ"
bad => "ÿ"
mbindex("ÿ", "ÿ") => Invalid multibyte sequence
../tests/test.rem(1742): mbindex(): Invalid multibyte sequence
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
mbindex("🙂🙂🙂🙂🙂xyzçççéfoo", "ç", 11) => 11
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
index("🙂🙂🙂🙂🙂xyzçççéfoo", "ç", 25) => 26
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
mbsubstr("🙂🙂🙂🙂🙂xyzçççéfoo", 2) => "🙂🙂🙂🙂xyzçççéfoo"
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
mbsubstr("🙂🙂🙂🙂🙂xyzçççéfoo", 2, 9) => "🙂🙂🙂🙂xyzç"
bad => "ÿ"
mbsubstr("ÿ", 1) => Invalid multibyte sequence
../tests/test.rem(1749): mbsubstr(): Invalid multibyte sequence
bad => "ÿ"
mbsubstr("ÿ", 1, 20) => Invalid multibyte sequence
../tests/test.rem(1750): mbsubstr(): Invalid multibyte sequence
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
substr("🙂🙂🙂🙂🙂xyzçççéfoo", 2) => "Ÿ™‚🙂🙂🙂🙂xyzçççéfoo"
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
substr("🙂🙂🙂🙂🙂xyzçççéfoo", 2, 9) => "Ÿ™‚🙂ð"
Variable hash table statistics:
Entries: 100144; Buckets: 87719; Non-empty Buckets: 66301
Entries: 100146; Buckets: 87719; Non-empty Buckets: 66303
Maxlen: 5; Minlen: 0; Avglen: 1.142; Stddev: 0.878; Avg nonempty len: 1.510
Growths: 13; Shrinks: 0
Function hash table statistics:
@@ -24589,6 +24624,9 @@ language
localtoutc
lower
max
mbindex
mbstrlen
mbsubstr
min
minsfromutc
minute
@@ -24993,6 +25031,8 @@ TRANSLATE "MAX-OVERDUE specified twice" ""
TRANSLATE "MAX-OVERDUE specified without TODO" ""
TRANSLATE "TZ specified twice" ""
TRANSLATE "TZ specified for non-timed reminder" ""
TRANSLATE "C library does not support multibyte characters" ""
TRANSLATE "Invalid multibyte sequence" ""
# Other Messages
TRANSLATE "%s function `%s' defined at %s(%s) does not use its argument" ""
+27
View File
@@ -1725,7 +1725,34 @@ fset subst_hash(a, b, c) "subst_hash"
REM MSG Overridden: %: %! %? %@ %#
# mbstrlen and friends
DEBUG -xe
set bad char(255)
set faces "🙂" * 5 + "xyz" + "çççéfoo"
DEBUG +x
set a mbstrlen(bad)
set a strlen(bad)
set a mbstrlen(faces)
set a strlen(faces)
set a mbindex(faces, "ç")
set a index(faces, "ç")
set a mbindex(bad, bad)
set a mbindex(faces, "ç", 11)
set a index(faces, "ç", 25)
set a mbsubstr(faces, 2)
set a mbsubstr(faces, 2, 9)
set a mbsubstr(bad, 1)
set a mbsubstr(bad, 1, 20)
set a substr(faces, 2)
set a substr(faces, 2, 9)
DEBUG -x
# Don't want Remind to queue reminders
EXIT