mirror of
https://salsa.debian.org/dskoll/remind.git
synced 2026-04-23 17:53:03 +02:00
Add better support for multi-byte character set.
The functions mbindex, mbstrlen and mbsubstr are just like index, strlen and substr, but they use and return 1-based character indexes rather than 1-based byte indexes.
This commit is contained in:
@@ -171,7 +171,8 @@
|
||||
"dusk" "easterdate" "escape" "eval" "evaltrig" "filedate" "filedatetime"
|
||||
"filedir" "filename" "getenv" "hebdate" "hebday" "hebmon" "hebyear"
|
||||
"hour" "htmlescape" "htmlstriptags" "iif" "index" "isany" "isconst" "isdst"
|
||||
"isleap" "isomitted" "language" "localtoutc" "lower" "max" "min"
|
||||
"isleap" "isomitted" "language" "localtoutc" "lower" "max"
|
||||
"mbindex" "mbstrlen" "mbsubstr" "min"
|
||||
"minsfromutc" "minute" "mon" "monnum" "moondate" "moondatetime"
|
||||
"moonphase" "moonrise" "moonrisedir" "moonset" "moonsetdir" "moontime"
|
||||
"multitrig" "ndawn" "ndusk" "nonconst" "nonomitted" "now" "ord" "orthodoxeaster"
|
||||
|
||||
+49
-10
@@ -2540,6 +2540,26 @@ word. The \fBINT\fR data type corresponds to the C "int" type.
|
||||
The \fBSTRING\fR data type consists of strings of characters. It is
|
||||
somewhat comparable to a C character array, but more closely resembles
|
||||
the string type in BASIC.
|
||||
.RS
|
||||
.PP
|
||||
\fBRemind\fR normally expects to be running in a UTF-8 environment.
|
||||
In this environment, there is a difference between \fIbytes\fR and
|
||||
\fIcharacters\fR since in UTF-8, a character may be represented by
|
||||
a sequence of more than one byte. For example, in a UTF-8 environment,
|
||||
the string "🙂" contains one character but four bytes. And the string
|
||||
"één" contains three characters but five bytes.
|
||||
.PP
|
||||
\fBRemind\fR has a set of functions
|
||||
that work on \fIbytes\fR, namely \fBindex\fR, \fBstrlen\fR and \fBsubstr\fR.
|
||||
These are not safe to use on multi-byte strings; instead use
|
||||
\fBmbindex\fR, \fBmbstrlen\fR and \fBmbsubstr\fR. If you know \fIfor sure\fR
|
||||
that a string contains only single-byte characters, then the byte-oriented
|
||||
versions may be used and are faster than the multi-byte versions.
|
||||
.PP
|
||||
Some ancient or embedded systems may lack the C library functions needed
|
||||
to deal with multi-byte strings. In that case, the \fBmb\fIxxx\fR functions
|
||||
all return an error.
|
||||
.RE
|
||||
.TP
|
||||
.B TIME
|
||||
The \fBTIME\fR data type is used for two different purposes: To represent
|
||||
@@ -4031,14 +4051,23 @@ compatible with previous versions of \fBRemind\fR.
|
||||
.TP
|
||||
.B index(s_search, s_target [,i_start])
|
||||
Returns an \fBINT\fR that is the location of \fItarget\fR in the
|
||||
string \fIsearch\fR. The first character of a string is numbered 1.
|
||||
If \fItarget\fR does not exist in \fIsearch\fR, then 0 is returned.
|
||||
string \fIsearch\fR. Note that \fBindex\fR uses \fIbyte\fR positions,
|
||||
not character positions, so it should not be used on non-ASCII strings. Use
|
||||
\fBmbindex\fR for non-ASCII strings.
|
||||
.PP
|
||||
The first byte of a string is numbered 1. If \fItarget\fR does not
|
||||
exist in \fIsearch\fR, then 0 is returned.
|
||||
.RS
|
||||
.PP
|
||||
The optional parameter \fIstart\fR specifies the position in
|
||||
\fIsearch\fR at which to start looking for \fItarget\fR.
|
||||
.RE
|
||||
.TP
|
||||
.B mbindex(s_search, s_target [,i_start])
|
||||
Similar to \fBindex()\fR but returns the \fIcharacter\fR position rather
|
||||
than the \fIbyte\fR position. Also, \fIstart\fR is interpreted as a
|
||||
1-based character index rather than a byte index.
|
||||
.TP
|
||||
.B isany(arg1 [,arg2, ..., argN]);
|
||||
Returns 1 if the first argument \fIarg1\fR is equal to any of the
|
||||
subsequent arguments \fIarg2\fR through \fIargN\fR; returns 0 otherwise.
|
||||
@@ -4642,17 +4671,27 @@ output is not going to a TTY.
|
||||
.RE
|
||||
.TP
|
||||
.B strlen(s_str)
|
||||
Returns the length of \fIstr\fR. If the length of \fIstr\fR is too large
|
||||
to represent as an integer, emits a "Number too high" error. Note that
|
||||
\fBstrlen\fR returns the number of \fIbytes\fR in the string, not the
|
||||
number of \fIcharacters\fR. These numbers are the same for ASCII strings,
|
||||
but may be different for UTF-8 strings.
|
||||
Returns the length of \fIstr\fR in bytes. If the length of \fIstr\fR
|
||||
is too large to represent as an integer, emits a "Number too high"
|
||||
error. Note that \fBstrlen\fR returns the number of \fIbytes\fR in
|
||||
the string, not the number of \fIcharacters\fR. These numbers are the
|
||||
same for ASCII strings, but may be different for UTF-8 strings.
|
||||
.TP
|
||||
.B mbstrlen(s_str)
|
||||
Similar to \fBstrlen\fR, but returns the length of the string in
|
||||
\fIcharacters\fR rather than \fIbytes\fR and is thus safe for use
|
||||
on multi-byte strings.
|
||||
.TP
|
||||
.B substr(s_str, i_start [,i_end])
|
||||
Returns a \fBSTRING\fR consisting of all characters in \fIstr\fR from
|
||||
\fIstart\fR up to and including \fIend\fR. Characters are numbered
|
||||
Returns a \fBSTRING\fR consisting of all bytes in \fIstr\fR from
|
||||
\fIstart\fR up to and including \fIend\fR. Bytes are numbered
|
||||
from 1. If \fIend\fR is not supplied, then it defaults to the length
|
||||
of \fIstr\fR.
|
||||
of \fIstr\fR. Because \fBsubstr\fR uses \fIbyte\fR indexes rather than
|
||||
\fIcharacter\fR indexes, it should not be used on multi-byte strings.
|
||||
.TP
|
||||
.B mbsubstr(s_str, i_start [,i_end])
|
||||
Similar to \fBsubstr\fR but uses \fIcharacter\fR indexes rather than
|
||||
\fIbyte\fR indexes, and is thus safe for use on multi-byte strings.
|
||||
.TP
|
||||
.B sunrise([dq_date])
|
||||
Returns a \fBTIME\fR indicating the time of sunrise on the specified
|
||||
|
||||
@@ -134,6 +134,8 @@
|
||||
#define E_MAX_OVERDUE_WITHOUT_TODO 110
|
||||
#define E_TZ_SPECIFIED_TWICE 111
|
||||
#define E_TZ_NO_AT 112
|
||||
#define E_NO_MB 113
|
||||
#define E_BAD_MB_SEQ 114
|
||||
|
||||
#ifdef MK_GLOBALS
|
||||
#undef EXTERN
|
||||
@@ -265,6 +267,8 @@ EXTERN char *ErrMsg[]
|
||||
/* E_MAX_OVERDUE_WITHOUT_TODO */ "MAX-OVERDUE specified without TODO",
|
||||
/* E_TZ_SPECIFIED_TWICE */ "TZ specified twice",
|
||||
/* E_TZ_NO_AT */ "TZ specified for non-timed reminder",
|
||||
/* E_NO_MB */ "C library does not support multibyte characters",
|
||||
/* E_BAD_MB_SEQ */ "Invalid multibyte sequence",
|
||||
}
|
||||
#endif /* MK_GLOBALS */
|
||||
;
|
||||
|
||||
+166
@@ -126,6 +126,9 @@ static int FLanguage (func_info *);
|
||||
static int FLocalToUTC (func_info *);
|
||||
static int FLower (func_info *);
|
||||
static int FMax (func_info *);
|
||||
static int FMbindex (func_info *);
|
||||
static int FMbstrlen (func_info *);
|
||||
static int FMbsubstr (func_info *);
|
||||
static int FMin (func_info *);
|
||||
static int FMinsfromutc (func_info *);
|
||||
static int FMinute (func_info *);
|
||||
@@ -305,6 +308,9 @@ BuiltinFunc Func[] = {
|
||||
{ "localtoutc", 1, 1, 1, FLocalToUTC, NULL },
|
||||
{ "lower", 1, 1, 1, FLower, NULL },
|
||||
{ "max", 1, NO_MAX, 1, FMax, NULL },
|
||||
{ "mbindex", 2, 3, 1, FMbindex, NULL },
|
||||
{ "mbstrlen", 1, 1, 1, FMbstrlen, NULL },
|
||||
{ "mbsubstr", 2, 3, 1, FMbsubstr, NULL },
|
||||
{ "min", 1, NO_MAX, 1, FMin, NULL },
|
||||
{ "minsfromutc", 0, 2, 0, FMinsfromutc, NULL },
|
||||
{ "minute", 1, 1, 1, FMinute, NULL },
|
||||
@@ -476,6 +482,29 @@ static int FStrlen(func_info *info)
|
||||
return OK;
|
||||
}
|
||||
|
||||
/***************************************************************/
|
||||
/* */
|
||||
/* FMBstrlen - string length in wide characters */
|
||||
/* */
|
||||
/***************************************************************/
|
||||
static int FMbstrlen(func_info *info)
|
||||
{
|
||||
#ifdef REM_USE_WCHAR
|
||||
ASSERT_TYPE(0, STR_TYPE);
|
||||
RetVal.type = INT_TYPE;
|
||||
size_t l = mbstowcs(NULL, ARGSTR(0), 0);
|
||||
if (l == (size_t) -1) {
|
||||
return E_BAD_MB_SEQ;
|
||||
}
|
||||
if (l > INT_MAX) return E_2HIGH;
|
||||
RETVAL = (int) l;
|
||||
return OK;
|
||||
#else
|
||||
RetVal.type = ERR_TYPE;
|
||||
return E_NO_MB;
|
||||
#endif
|
||||
}
|
||||
|
||||
/***************************************************************/
|
||||
/* */
|
||||
/* FBaseyr - system base year */
|
||||
@@ -2378,6 +2407,76 @@ static int FSubstr(func_info *info)
|
||||
return RetStrVal(t, info);
|
||||
}
|
||||
|
||||
/***************************************************************/
|
||||
/* */
|
||||
/* FMbubstr */
|
||||
/* */
|
||||
/* The mbsubstr function. */
|
||||
/* */
|
||||
/***************************************************************/
|
||||
static int FMbsubstr(func_info *info)
|
||||
{
|
||||
#ifdef REM_USE_WCHAR
|
||||
wchar_t *str;
|
||||
wchar_t *s;
|
||||
wchar_t const *t;
|
||||
size_t mblen;
|
||||
char *converted;
|
||||
size_t len;
|
||||
int start;
|
||||
int end;
|
||||
|
||||
if (ARG(0).type != STR_TYPE || ARG(1).type != INT_TYPE) return E_BAD_TYPE;
|
||||
if (Nargs == 3 && ARG(2).type != INT_TYPE) return E_BAD_TYPE;
|
||||
|
||||
mblen = mbstowcs(NULL, ARGSTR(0), 0);
|
||||
if (mblen == (size_t) -1) {
|
||||
return E_BAD_MB_SEQ;
|
||||
}
|
||||
str = calloc(mblen+1, sizeof(wchar_t));
|
||||
if (!str) {
|
||||
return E_NO_MEM;
|
||||
}
|
||||
(void) mbstowcs(str, ARGSTR(0), mblen+1);
|
||||
s = str;
|
||||
start = 1;
|
||||
while (start < ARGV(1)) {
|
||||
if (!*s) break;
|
||||
s++;
|
||||
start++;
|
||||
}
|
||||
t = s;
|
||||
if (Nargs >= 3) {
|
||||
end = start;
|
||||
while (end <= ARGV(2)) {
|
||||
if (!*s) break;
|
||||
s++;
|
||||
end++;
|
||||
}
|
||||
*s = (wchar_t) 0;
|
||||
}
|
||||
|
||||
len = wcstombs(NULL, t, 0);
|
||||
if (len == (size_t) -1) {
|
||||
free( (void *) str);
|
||||
return E_BAD_MB_SEQ;
|
||||
}
|
||||
converted = malloc(len+1);
|
||||
if (!converted) {
|
||||
free( (void *) str);
|
||||
return E_NO_MEM;
|
||||
}
|
||||
(void) wcstombs(converted, t, len+1);
|
||||
RetVal.type = STR_TYPE;
|
||||
RetVal.v.str = converted;
|
||||
free( (void *) str);
|
||||
return OK;
|
||||
#else
|
||||
RetVal.type = ERR_TYPE;
|
||||
return E_NO_MB;
|
||||
#endif
|
||||
}
|
||||
|
||||
/***************************************************************/
|
||||
/* */
|
||||
/* FIndex */
|
||||
@@ -2416,6 +2515,73 @@ static int FIndex(func_info *info)
|
||||
return OK;
|
||||
}
|
||||
|
||||
/***************************************************************/
|
||||
/* */
|
||||
/* FMbindex */
|
||||
/* */
|
||||
/* The wide-char of one string embedded in another. */
|
||||
/* */
|
||||
/***************************************************************/
|
||||
static int FMbindex(func_info *info)
|
||||
{
|
||||
#ifdef REM_USE_WCHAR
|
||||
wchar_t *haystack;
|
||||
wchar_t *needle;
|
||||
wchar_t const *s;
|
||||
size_t haylen, needlelen;
|
||||
|
||||
if (ARG(0).type != STR_TYPE || ARG(1).type != STR_TYPE ||
|
||||
(Nargs == 3 && ARG(2).type != INT_TYPE)) return E_BAD_TYPE;
|
||||
|
||||
haylen = mbstowcs(NULL, ARGSTR(0), INT_MAX);
|
||||
if (haylen == (size_t) -1) {
|
||||
return E_BAD_MB_SEQ;
|
||||
}
|
||||
haystack = calloc(haylen+1, sizeof(wchar_t));
|
||||
if (!haystack) {
|
||||
return E_NO_MEM;
|
||||
}
|
||||
(void) mbstowcs(haystack, ARGSTR(0), haylen+1);
|
||||
needlelen = mbstowcs(NULL, ARGSTR(1), INT_MAX);
|
||||
if (needlelen == (size_t) -1) {
|
||||
return E_BAD_MB_SEQ;
|
||||
}
|
||||
needle = calloc(needlelen+1, sizeof(wchar_t));
|
||||
if (!needle) {
|
||||
free( (void *) haystack);
|
||||
return E_NO_MEM;
|
||||
}
|
||||
(void) mbstowcs(needle, ARGSTR(1), needlelen+1);
|
||||
s = haystack;
|
||||
|
||||
/* If 3 args, bump up the start */
|
||||
if (Nargs == 3) {
|
||||
if (ARGV(2) > (int) haylen) {
|
||||
s += haylen;
|
||||
} else {
|
||||
s += ARGV(2) - 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Find the string */
|
||||
RetVal.type = INT_TYPE;
|
||||
s = wcsstr(s, needle);
|
||||
if (!s) {
|
||||
free( (void *) haystack);
|
||||
free( (void *) needle);
|
||||
RETVAL = 0;
|
||||
return OK;
|
||||
}
|
||||
RETVAL = s - haystack + 1;
|
||||
free( (void *) haystack);
|
||||
free( (void *) needle);
|
||||
return OK;
|
||||
#else
|
||||
RetVal.type = ERR_TYPE;
|
||||
return E_NO_MB;
|
||||
#endif
|
||||
}
|
||||
|
||||
/***************************************************************/
|
||||
/* */
|
||||
/* FIif */
|
||||
|
||||
+3
-3
@@ -59,7 +59,7 @@ chmod 000 include_dir/04cantread.rem
|
||||
TEST_GETENV="foo bar baz" ; export TEST_GETENV
|
||||
echo "Test 1" > ../tests/test.out
|
||||
echo "" >> ../tests/test.out
|
||||
../src/remind --flush -e -dxte ../tests/test.rem 16 feb 1991 12:13 2>&1 | grep -v 'TimetIs64bit' >> ../tests/test.out
|
||||
../src/remind --flush -e -dxte ../tests/test.rem 16 feb 1991 12:13 2>&1 | grep -v -a 'TimetIs64bit' >> ../tests/test.out 2>&1
|
||||
echo "" >> ../tests/test.out
|
||||
echo "Test 2" >> ../tests/test.out
|
||||
echo "" >> ../tests/test.out
|
||||
@@ -618,7 +618,7 @@ rm -f ../tests/once.timestamp
|
||||
../src/remind --flush -q ../tests/dedupe.rem 8 November 2023 >> ../tests/test.out 2>&1
|
||||
|
||||
# Remove references to SysInclude, which is build-specific
|
||||
grep -F -v '$SysInclude' < ../tests/test.out > ../tests/test.out.1 && mv -f ../tests/test.out.1 ../tests/test.out
|
||||
grep -F -v -a '$SysInclude' < ../tests/test.out > ../tests/test.out.1 && mv -f ../tests/test.out.1 ../tests/test.out
|
||||
|
||||
# If "man" accepts the --warnings flag, test all the man pages.
|
||||
RUNMAN=0
|
||||
@@ -785,7 +785,7 @@ echo "... and here is stderr" >> ../tests/test.out 2>&1
|
||||
|
||||
# Test %: substitution sequence in all the languages
|
||||
for i in ../include/lang/??.rem ; do
|
||||
../src/remind --flush "-ii=\"$i\"" -p - 2025-08-13 <<'EOF' 2>&1 | grep 2025/ >> ../tests/test.out
|
||||
../src/remind --flush "-ii=\"$i\"" -p - 2025-08-13 <<'EOF' 2>&1 | grep -a 2025/ >> ../tests/test.out
|
||||
DO [i]
|
||||
REM TODO 2025-08-13 MSG %(LANGID) Task1%:
|
||||
REM TODO 2025-08-13 COMPLETE-THROUGH 2025-08-12 MSG %(LANGID) Task2%:
|
||||
|
||||
+41
-1
@@ -16592,8 +16592,43 @@ Leaving UserFN c() => 33
|
||||
|
||||
DEBUG -xe
|
||||
Overridden: subst_colon subst_bang subst_question subst_at subst_hash
|
||||
bad => "ÿ"
|
||||
mbstrlen("ÿ") => Invalid multibyte sequence
|
||||
../tests/test.rem(1734): mbstrlen(): Invalid multibyte sequence
|
||||
bad => "ÿ"
|
||||
strlen("ÿ") => 1
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
mbstrlen("🙂🙂🙂🙂🙂xyzçççéfoo") => 15
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
strlen("🙂🙂🙂🙂🙂xyzçççéfoo") => 34
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
mbindex("🙂🙂🙂🙂🙂xyzçççéfoo", "ç") => 9
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
index("🙂🙂🙂🙂🙂xyzçççéfoo", "ç") => 24
|
||||
bad => "ÿ"
|
||||
bad => "ÿ"
|
||||
mbindex("ÿ", "ÿ") => Invalid multibyte sequence
|
||||
../tests/test.rem(1742): mbindex(): Invalid multibyte sequence
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
mbindex("🙂🙂🙂🙂🙂xyzçççéfoo", "ç", 11) => 11
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
index("🙂🙂🙂🙂🙂xyzçççéfoo", "ç", 25) => 26
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
mbsubstr("🙂🙂🙂🙂🙂xyzçççéfoo", 2) => "🙂🙂🙂🙂xyzçççéfoo"
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
mbsubstr("🙂🙂🙂🙂🙂xyzçççéfoo", 2, 9) => "🙂🙂🙂🙂xyzç"
|
||||
bad => "ÿ"
|
||||
mbsubstr("ÿ", 1) => Invalid multibyte sequence
|
||||
../tests/test.rem(1749): mbsubstr(): Invalid multibyte sequence
|
||||
bad => "ÿ"
|
||||
mbsubstr("ÿ", 1, 20) => Invalid multibyte sequence
|
||||
../tests/test.rem(1750): mbsubstr(): Invalid multibyte sequence
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
substr("🙂🙂🙂🙂🙂xyzçççéfoo", 2) => "Ÿ™‚🙂🙂🙂🙂xyzçççéfoo"
|
||||
faces => "🙂🙂🙂🙂🙂xyzçççéfoo"
|
||||
substr("🙂🙂🙂🙂🙂xyzçççéfoo", 2, 9) => "Ÿ™‚🙂ð"
|
||||
Variable hash table statistics:
|
||||
Entries: 100144; Buckets: 87719; Non-empty Buckets: 66301
|
||||
Entries: 100146; Buckets: 87719; Non-empty Buckets: 66303
|
||||
Maxlen: 5; Minlen: 0; Avglen: 1.142; Stddev: 0.878; Avg nonempty len: 1.510
|
||||
Growths: 13; Shrinks: 0
|
||||
Function hash table statistics:
|
||||
@@ -24589,6 +24624,9 @@ language
|
||||
localtoutc
|
||||
lower
|
||||
max
|
||||
mbindex
|
||||
mbstrlen
|
||||
mbsubstr
|
||||
min
|
||||
minsfromutc
|
||||
minute
|
||||
@@ -24993,6 +25031,8 @@ TRANSLATE "MAX-OVERDUE specified twice" ""
|
||||
TRANSLATE "MAX-OVERDUE specified without TODO" ""
|
||||
TRANSLATE "TZ specified twice" ""
|
||||
TRANSLATE "TZ specified for non-timed reminder" ""
|
||||
TRANSLATE "C library does not support multibyte characters" ""
|
||||
TRANSLATE "Invalid multibyte sequence" ""
|
||||
|
||||
# Other Messages
|
||||
TRANSLATE "%s function `%s' defined at %s(%s) does not use its argument" ""
|
||||
|
||||
@@ -1725,7 +1725,34 @@ fset subst_hash(a, b, c) "subst_hash"
|
||||
|
||||
REM MSG Overridden: %: %! %? %@ %#
|
||||
|
||||
# mbstrlen and friends
|
||||
DEBUG -xe
|
||||
set bad char(255)
|
||||
set faces "🙂" * 5 + "xyz" + "çççéfoo"
|
||||
|
||||
DEBUG +x
|
||||
set a mbstrlen(bad)
|
||||
set a strlen(bad)
|
||||
|
||||
set a mbstrlen(faces)
|
||||
set a strlen(faces)
|
||||
|
||||
set a mbindex(faces, "ç")
|
||||
set a index(faces, "ç")
|
||||
set a mbindex(bad, bad)
|
||||
|
||||
set a mbindex(faces, "ç", 11)
|
||||
set a index(faces, "ç", 25)
|
||||
|
||||
set a mbsubstr(faces, 2)
|
||||
set a mbsubstr(faces, 2, 9)
|
||||
set a mbsubstr(bad, 1)
|
||||
set a mbsubstr(bad, 1, 20)
|
||||
|
||||
set a substr(faces, 2)
|
||||
set a substr(faces, 2, 9)
|
||||
|
||||
DEBUG -x
|
||||
# Don't want Remind to queue reminders
|
||||
EXIT
|
||||
|
||||
|
||||
Reference in New Issue
Block a user