Skip to content

Commit dace60f

Browse files
committed
win32.c: make reading UTF-8 characters from the console possible
Due to a bug in Windows, ReadFile() and ReadConsoleA() (and thus _read()) return zeros instead of non-ASCII characters when the console codepage is set to 65001. See this ticket for more details: microsoft/terminal#4551. This commit works around that bug by using ReadConsoleW() inside win32_read() when the passed fd points to the console and the console codepage is set to 65001. Fixes #18701
1 parent 44646a1 commit dace60f

File tree

1 file changed

+125
-1
lines changed

1 file changed

+125
-1
lines changed

win32/win32.c

Lines changed: 125 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,10 @@ static const SYSTEMTIME time_t_epoch_base_systemtime = {
196196

197197
#define FILETIME_CHUNKS_PER_SECOND (10000000UL)
198198

199+
#ifdef USE_ITHREADS
200+
/* Held by win32_read() around its call to win32_read_console(), which keeps
 * carry-over bytes in function-static storage. Initialized in
 * Perl_win32_init(). */
static perl_mutex win32_read_console_mutex;
201+
#endif
202+
199203
#ifdef SET_INVALID_PARAMETER_HANDLER
200204
static BOOL silent_invalid_parameter_handler = FALSE;
201205

@@ -3743,10 +3747,128 @@ win32_dup2(int fd1,int fd2)
37433747
return dup2(fd1,fd2);
37443748
}
37453749

3750+
static int
3751+
win32_read_console(int fd, U8 *buf, unsigned int cnt)
3752+
{
3753+
/* This function is a workaround for a bug in Windows:
3754+
* https://github.com/microsoft/terminal/issues/4551
3755+
* tl;dr: ReadFile() and ReadConsoleA() return garbage when reading
3756+
* non-ASCII characters from the console with the 65001 codepage.
3757+
*/
3758+
HANDLE h = (HANDLE)_get_osfhandle(fd);
3759+
size_t left_to_read = cnt;
3760+
DWORD mode;
3761+
3762+
if (h == INVALID_HANDLE_VALUE) {
3763+
errno = EBADF;
3764+
return -1;
3765+
}
3766+
3767+
if (!GetConsoleMode(h, &mode)) {
3768+
translate_to_errno();
3769+
return -1;
3770+
}
3771+
3772+
while (left_to_read) {
3773+
/* The purpose of converted_buf is to preserve partial UTF-8 (or of any
3774+
* other multibyte encoding) code points between read() calls. Since
3775+
* there's only one console, the buffer is global. It's needed because
3776+
* ReadConsoleW() returns a string of UTF-16 code units and its result,
3777+
* after conversion to the current console codepage, may not fit in the
3778+
* return buffer.
3779+
*
3780+
* The buffer's size is 8 because it will contain at most two UTF-8 code
3781+
* points.
3782+
*/
3783+
static char converted_buf[8];
3784+
static size_t converted_buf_len = 0;
3785+
WCHAR wbuf[2];
3786+
DWORD wbuf_len = 0, chars_read;
3787+
3788+
if (converted_buf_len) {
3789+
bool newline = 0;
3790+
size_t to_write = MIN(converted_buf_len, left_to_read);
3791+
3792+
/* Don't read anything if the *first* character is ^Z and
3793+
* ENABLE_PROCESSED_INPUT is enabled. On some versions of Windows,
3794+
* ReadFile() ignores ENABLE_PROCESSED_INPUT, but apparently it's a
3795+
* bug: https://github.com/microsoft/terminal/issues/4958
3796+
*/
3797+
if (left_to_read == cnt && (mode & ENABLE_PROCESSED_INPUT) &&
3798+
converted_buf[0] == 0x1a)
3799+
break;
3800+
3801+
/* Are we returning a newline? */
3802+
if (memchr(converted_buf, '\n', to_write))
3803+
newline = 1;
3804+
3805+
memcpy(buf, converted_buf, to_write);
3806+
buf += to_write;
3807+
3808+
/* If there's anything left in converted_buf, move it to the
3809+
* beginning of the buffer. */
3810+
converted_buf_len -= to_write;
3811+
if (converted_buf_len)
3812+
memmove(
3813+
converted_buf, converted_buf + to_write, converted_buf_len
3814+
);
3815+
3816+
left_to_read -= to_write;
3817+
3818+
/* With ENABLE_LINE_INPUT enabled, we stop reading after the first
3819+
* newline, otherwise we stop reading after the first character. */
3820+
if (!left_to_read || newline || (mode & ENABLE_LINE_INPUT) == 0)
3821+
break;
3822+
}
3823+
3824+
/* Reading one code unit at a time is inefficient, but since this code
3825+
* is used only for the interactive console, that shouldn't matter. */
3826+
if (!ReadConsoleW(h, wbuf, 1, &chars_read, 0)) {
3827+
translate_to_errno();
3828+
return -1;
3829+
}
3830+
if (!chars_read)
3831+
break;
3832+
3833+
++wbuf_len;
3834+
3835+
if (wbuf[0] >= 0xD800 && wbuf[0] <= 0xDBFF) {
3836+
/* High surrogate, read one more code unit. */
3837+
if (!ReadConsoleW(h, wbuf + 1, 1, &chars_read, 0)) {
3838+
translate_to_errno();
3839+
return -1;
3840+
}
3841+
if (chars_read)
3842+
++wbuf_len;
3843+
}
3844+
3845+
converted_buf_len = WideCharToMultiByte(
3846+
GetConsoleCP(), 0, wbuf, wbuf_len, converted_buf,
3847+
sizeof(converted_buf), NULL, NULL
3848+
);
3849+
if (!converted_buf_len) {
3850+
translate_to_errno();
3851+
return -1;
3852+
}
3853+
}
3854+
3855+
return cnt - left_to_read;
3856+
}
3857+
3858+
37463859
DllExport int
37473860
win32_read(int fd, void *buf, unsigned int cnt)
37483861
{
3749-
return read(fd, buf, cnt);
3862+
int ret;
3863+
if (UNLIKELY(win32_isatty(fd) && GetConsoleCP() == 65001)) {
3864+
MUTEX_LOCK(&win32_read_console_mutex);
3865+
ret = win32_read_console(fd, buf, cnt);
3866+
MUTEX_UNLOCK(&win32_read_console_mutex);
3867+
}
3868+
else
3869+
ret = read(fd, buf, cnt);
3870+
3871+
return ret;
37503872
}
37513873

37523874
DllExport int
@@ -4907,6 +5029,8 @@ Perl_win32_init(int *argcp, char ***argvp)
49075029
time_t_epoch_base_filetime.LowPart = ft.dwLowDateTime;
49085030
time_t_epoch_base_filetime.HighPart = ft.dwHighDateTime;
49095031
}
5032+
5033+
MUTEX_INIT(&win32_read_console_mutex);
49105034
}
49115035

49125036
void

0 commit comments

Comments
 (0)