From 1694bc733d3ace8072c58e7da457dd2995998189 Mon Sep 17 00:00:00 2001 From: Damien George Date: Wed, 16 Jul 2014 11:45:10 +0100 Subject: [PATCH] py: Add stream reading of n unicode chars; unicode support by default. With unicode enabled, this patch allows reading a fixed number of characters from text-mode streams; eg file.read(5) will read 5 unicode chars, which can be made of more than 5 bytes. For an ASCII stream (ie no chars > 127) it only needs to do 1 read. If there are lots of non-ASCII chars in a stream, then it needs multiple reads of the underlying object. Adds a new test for this case. Enables unicode support by default on unix and stmhal ports. --- py/stream.c | 88 +++++++++++++++++++++++++++++++++- stmhal/mpconfigport.h | 2 +- tests/run-tests | 2 +- tests/unicode/data/utf-8_2.txt | 1 + tests/unicode/file2.py | 12 +++++ unix/mpconfigport.h | 2 +- 6 files changed, 103 insertions(+), 4 deletions(-) create mode 100644 tests/unicode/data/utf-8_2.txt create mode 100644 tests/unicode/file2.py diff --git a/py/stream.c b/py/stream.c index 2b4410728..4c8b8a570 100644 --- a/py/stream.c +++ b/py/stream.c @@ -67,6 +67,9 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) { nlr_raise(mp_obj_new_exception_msg(&mp_type_OSError, "Operation not supported")); } + // What to do if sz < -1? Python docs don't specify this case. + // CPython does a readall, but here we silently let negatives through, + // and they will cause a MemoryError. mp_int_t sz; if (n_args == 1 || ((sz = mp_obj_get_int(args[1])) == -1)) { return stream_readall(args[0]); @@ -74,7 +77,90 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) { #if MICROPY_PY_BUILTINS_STR_UNICODE if (!o->type->stream_p->is_bytes) { - mp_not_implemented("Reading from unicode text streams by character count"); + // We need to read sz number of unicode characters. 
Because we don't have any + // buffering, and because the stream API can only read bytes, we must read here + // in units of bytes and must never over read. If we want sz chars, then reading + // sz bytes will never over-read, so we follow this approach, in a loop to keep + // reading until we have exactly enough chars. This will be 1 read for text + // with ASCII-only chars, and about 2 reads for text with a couple of non-ASCII + // chars. For text with lots of non-ASCII chars, it'll be pretty inefficient + // in time and memory. + + vstr_t vstr; + vstr_init(&vstr, sz); + mp_uint_t more_bytes = sz; + mp_uint_t last_buf_offset = 0; + while (more_bytes > 0) { + char *p = vstr_add_len(&vstr, more_bytes); + if (p == NULL) { + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_MemoryError, "out of memory")); + } + int error; + mp_int_t out_sz = o->type->stream_p->read(o, p, more_bytes, &error); + if (out_sz == -1) { + vstr_cut_tail_bytes(&vstr, more_bytes); + if (is_nonblocking_error(error)) { + // With non-blocking streams, we read as much as we can. + // If we read nothing, return None, just like read(). + // Otherwise, return data read so far. + // TODO what if we have read only half a non-ASCII char? + if (vstr.len == 0) { + vstr_clear(&vstr); + return mp_const_none; + } + break; + } + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_OSError, "[Errno %d]", error)); + } + + if (out_sz == 0) { + // Finish reading. + // TODO what if we have read only half a non-ASCII char? 
+ vstr_cut_tail_bytes(&vstr, more_bytes); + break; + } + + // count chars from bytes just read + for (mp_uint_t off = last_buf_offset;;) { + byte b = vstr.buf[off]; + int n; + if (!UTF8_IS_NONASCII(b)) { + // 1-byte ASCII char + n = 1; + } else if ((b & 0xe0) == 0xc0) { + // 2-byte char + n = 2; + } else if ((b & 0xf0) == 0xe0) { + // 3-byte char + n = 3; + } else if ((b & 0xf8) == 0xf0) { + // 4-byte char + n = 4; + } else { + // TODO + n = 5; + } + if (off + n <= vstr.len) { + // got a whole char in n bytes + off += n; + sz -= 1; + last_buf_offset = off; + if (off >= vstr.len) { + more_bytes = sz; + break; + } + } else { + // didn't get a whole char, so work out how many extra bytes are needed for + // this partial char, plus bytes for additional chars that we want + more_bytes = (off + n - vstr.len) + (sz - 1); + break; + } + } + } + + mp_obj_t ret = mp_obj_new_str_of_type(&mp_type_str, (byte*)vstr.buf, vstr.len); + vstr_clear(&vstr); + return ret; } #endif diff --git a/stmhal/mpconfigport.h b/stmhal/mpconfigport.h index 00afa989c..95f142ca4 100644 --- a/stmhal/mpconfigport.h +++ b/stmhal/mpconfigport.h @@ -44,7 +44,7 @@ */ #define MICROPY_ENABLE_LFN (1) #define MICROPY_LFN_CODE_PAGE (437) /* 1=SFN/ANSI 437=LFN/U.S.(OEM) */ -#define MICROPY_PY_BUILTINS_STR_UNICODE (0) +#define MICROPY_PY_BUILTINS_STR_UNICODE (1) #define MICROPY_PY_BUILTINS_FROZENSET (1) #define MICROPY_PY_SYS_EXIT (1) #define MICROPY_PY_SYS_STDFILES (1) diff --git a/tests/run-tests b/tests/run-tests index 71a94f946..4b48421de 100755 --- a/tests/run-tests +++ b/tests/run-tests @@ -134,7 +134,7 @@ def main(): if args.test_dirs is None: if pyb is None: # run PC tests - test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc') + test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc', 'unicode') else: # run pyboard tests test_dirs = ('basics', 'micropython', 'float', 'pyb', 'pybnative', 'inlineasm') diff --git a/tests/unicode/data/utf-8_2.txt 
b/tests/unicode/data/utf-8_2.txt new file mode 100644 index 000000000..ab0eaa4e0 --- /dev/null +++ b/tests/unicode/data/utf-8_2.txt @@ -0,0 +1 @@ +aαbβcγdδ diff --git a/tests/unicode/file2.py b/tests/unicode/file2.py new file mode 100644 index 000000000..aca2e0e0e --- /dev/null +++ b/tests/unicode/file2.py @@ -0,0 +1,12 @@ +# test reading a given number of characters + +def do(mode): + f = open('unicode/data/utf-8_2.txt', mode) + print(f.read(1)) + print(f.read(1)) + print(f.read(2)) + print(f.read(4)) + f.close() + +do('rb') +do('rt') diff --git a/unix/mpconfigport.h b/unix/mpconfigport.h index 0831e3fd3..ce4365d36 100644 --- a/unix/mpconfigport.h +++ b/unix/mpconfigport.h @@ -43,7 +43,7 @@ #define MICROPY_LONGINT_IMPL (MICROPY_LONGINT_IMPL_MPZ) #define MICROPY_STREAMS_NON_BLOCK (1) #define MICROPY_OPT_COMPUTED_GOTO (1) -#define MICROPY_PY_BUILTINS_STR_UNICODE (0) +#define MICROPY_PY_BUILTINS_STR_UNICODE (1) #define MICROPY_PY_BUILTINS_FROZENSET (1) #define MICROPY_PY_SYS_EXIT (1) #define MICROPY_PY_SYS_PLATFORM "linux"