/*
   +----------------------------------------------------------------------+
   | PHP Version 6                                                        |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Authors: Andrei Zmievski                                             |
   +----------------------------------------------------------------------+
*/

/* $Id$ */

/*
 * TODO
 *
 * - test with empty and 1 character strings
 * - optimize current() to pass return_value to the handler so that it fills it
 *   in directly instead of creating a new zval
 * - implement Countable (or count_elements handler) and Seekable interfaces
 */

#include "php.h"
#include "zend_interfaces.h"
#include "zend_exceptions.h"
/* ICU break iterator API: UBreakIterator, UBRK_DONE and the ubrk_* calls. */
#include <unicode/ubrk.h>

/*
 * Kind of text unit an iterator steps over. The values are used as indices
 * into the per-type operations array, and the values from ITER_CHARACTER
 * onward are the ones backed by an ICU break iterator.
 */
typedef enum {
    ITER_CODE_POINT,
    ITER_COMB_SEQUENCE,
    ITER_CHARACTER,
    ITER_WORD,
    ITER_LINE,
    ITER_SENTENCE,
    ITER_TYPE_LAST,
} text_iter_type;

/* Flag bit selecting backward iteration. */
static const uint32_t ITER_REVERSE = 0x100;
/* Mask extracting the text_iter_type from the user-supplied flags. */
static const uint32_t ITER_TYPE_MASK = 0xFF;

/*
 * Instance data of a TextIterator / ReverseTextIterator object.
 */
typedef struct {
    zend_object     std;            /* standard object header */
    UChar*          text;           /* private copy of the iterated text */
    uint32_t        text_len;       /* length of text in UTF-16 code units */
    text_iter_type  type;           /* which set of operations to use */
    zval*           current;        /* reusable zval holding the current element */
    size_t          current_alloc;  /* capacity of current's buffer, in UChars */
    long            flags;          /* constructor flags (type bits, ITER_REVERSE) */
    union {
        /* state for ITER_CODE_POINT */
        struct {
            int32_t offset;         /* code unit offset, or UBRK_DONE */
            int32_t cp_offset;      /* code point offset, or UBRK_DONE */
            int32_t index;          /* number of next() steps taken */
        } cp;
        /* state for ITER_COMB_SEQUENCE */
        struct {
            int32_t start;          /* code unit offset of the current boundary */
            int32_t end;            /* code unit offset of the adjacent boundary */
            int32_t index;          /* number of next() steps taken */
            int32_t start_cp_offset;
            int32_t end_cp_offset;
        } cs;
        /* state for the ICU break iterator based types */
        struct {
            int32_t bound;          /* code unit offset of the current boundary */
            int32_t next;           /* code unit offset of the adjacent boundary */
            int32_t index;          /* number of next() steps taken */
            int32_t cp_offset;      /* code point offset of bound */
            UBreakIterator *iter;   /* iterator tracking the current position */
            UBreakIterator *n_iter; /* second iterator used to look up `next` */
        } brk;
    } u;
    zend_object_iterator iter;      /* embedded iterator handed to the engine */
} text_iter_obj;

/*
 * Recovers the enclosing text_iter_obj from a pointer to its embedded
 * zend_object_iterator member.
 */
static inline text_iter_obj* text_iter_to_obj(zend_object_iterator *iter)
{
    return (text_iter_obj *)((char*)iter - offsetof(text_iter_obj, iter));
}
typedef struct { int (*valid) (text_iter_obj* object, long flags TSRMLS_DC); void (*current) (text_iter_obj* object, long flags TSRMLS_DC); int (*key) (text_iter_obj* object, long flags TSRMLS_DC); int (*offset) (text_iter_obj* object, long flags, int32_t *cp_offset TSRMLS_DC); void (*next) (text_iter_obj* object, long flags TSRMLS_DC); void (*rewind) (text_iter_obj* object, long flags TSRMLS_DC); void (*following) (text_iter_obj* object, int32_t offset, long flags TSRMLS_DC); zend_bool (*isBoundary)(text_iter_obj* object, int32_t offset, long flags TSRMLS_DC); } text_iter_ops; enum UBreakIteratorType brk_type_map[] = { UBRK_CHARACTER, UBRK_WORD, UBRK_LINE, UBRK_SENTENCE, }; PHPAPI zend_class_entry* text_iterator_ce; PHPAPI zend_class_entry* rev_text_iterator_ce; /* Code point ops */ static int text_iter_cp_valid(text_iter_obj* object, long flags TSRMLS_DC) { if (object->u.cp.offset == UBRK_DONE) { return 0; } if (flags & ITER_REVERSE) { return (object->u.cp.offset != 0); } else { return (object->u.cp.offset != object->text_len); } } static void text_iter_cp_current(text_iter_obj* object, long flags TSRMLS_DC) { UChar32 cp = 0; int32_t tmp, buf_len = 0; tmp = object->u.cp.offset; if (flags & ITER_REVERSE) { if (object->u.cp.offset != UBRK_DONE && object->u.cp.offset > 0) { U16_PREV(object->text, 0, tmp, cp); buf_len = zend_codepoint_to_uchar(cp, Z_USTRVAL_P(object->current)); } } else { if (object->u.cp.offset != UBRK_DONE && object->u.cp.offset < object->text_len) { U16_NEXT(object->text, tmp, object->text_len, cp); buf_len = zend_codepoint_to_uchar(cp, Z_USTRVAL_P(object->current)); } } Z_USTRVAL_P(object->current)[buf_len] = 0; Z_USTRLEN_P(object->current) = buf_len; } static int text_iter_cp_key(text_iter_obj* object, long flags TSRMLS_DC) { return object->u.cp.index; } static int text_iter_cp_offset(text_iter_obj* object, long flags, int32_t *cp_offset TSRMLS_DC) { if (cp_offset) { *cp_offset = object->u.cp.cp_offset; } return object->u.cp.offset; } static 
void text_iter_cp_next(text_iter_obj* object, long flags TSRMLS_DC) { if (object->u.cp.offset == UBRK_DONE) { return; } if (flags & ITER_REVERSE) { U16_BACK_1(object->text, 0, object->u.cp.offset); if (object->u.cp.offset <= object->text_len) { object->u.cp.cp_offset--; } else { object->u.cp.offset = object->u.cp.cp_offset = UBRK_DONE; } } else { U16_FWD_1(object->text, object->u.cp.offset, object->text_len); if (object->u.cp.offset <= object->text_len) { object->u.cp.cp_offset++; } else { object->u.cp.offset = object->u.cp.cp_offset = UBRK_DONE; } } object->u.cp.index++; } static void text_iter_cp_rewind(text_iter_obj *object, long flags TSRMLS_DC) { if (flags & ITER_REVERSE) { object->u.cp.offset = object->text_len; object->u.cp.cp_offset = u_countChar32(object->text, object->text_len); } else { object->u.cp.offset = 0; object->u.cp.cp_offset = 0; } object->u.cp.index = 0; } static void text_iter_cp_following(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC) { int32_t k; if (offset < 0) { offset = 0; } /* * On invalid iterator we always want to start looking for the code unit * offset from the beginning of the string. */ if (object->u.cp.cp_offset == UBRK_DONE) { object->u.cp.cp_offset = 0; object->u.cp.offset = 0; } /* * Try to locate the code unit position relative to the last known codepoint * offset. */ k = object->u.cp.offset; if (offset > object->u.cp.cp_offset) { U16_FWD_N(object->text, k, object->text_len, offset - object->u.cp.cp_offset); } else { U16_BACK_N(object->text, 0, k, object->u.cp.cp_offset - offset); } /* * Locate the actual boundary. 
*/ if (flags & ITER_REVERSE) { if (k == 0) { object->u.cp.cp_offset = UBRK_DONE; object->u.cp.offset = UBRK_DONE; return; } else { U16_BACK_1(object->text, 0, k); } } else { if (k == object->text_len) { object->u.cp.cp_offset = UBRK_DONE; object->u.cp.offset = UBRK_DONE; return; } else { U16_FWD_1(object->text, k, object->text_len); } } /* * If boundary is the same one as where we were at before, simply return. */ if (k == object->u.cp.offset) { return; } /* * Adjust the internal codepoint offset based on how far we've moved. */ if (k > object->u.cp.offset) { if (k - object->u.cp.offset > 1) { object->u.cp.cp_offset += u_countChar32(object->text + object->u.cp.offset, k - object->u.cp.offset); } else { object->u.cp.cp_offset++; } } else { if (object->u.cp.offset - k > 1) { object->u.cp.cp_offset -= u_countChar32(object->text + k, object->u.cp.offset - k); } else { object->u.cp.cp_offset--; } } object->u.cp.offset = k; } static zend_bool text_iter_cp_isBoundary(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC) { int32_t k; if (offset < 0) { offset = 0; } /* * On invalid iterator we always want to start looking for the code unit * offset from the beginning of the string. */ if (object->u.cp.cp_offset == UBRK_DONE) { object->u.cp.cp_offset = 0; object->u.cp.offset = 0; } /* * Try to locate the code unit position relative to the last known codepoint * offset. */ k = object->u.cp.offset; if (offset > object->u.cp.cp_offset) { U16_FWD_N(object->text, k, object->text_len, offset - object->u.cp.cp_offset); } else { U16_BACK_N(object->text, 0, k, object->u.cp.cp_offset - offset); } if (k == object->text_len) { object->u.cp.cp_offset += u_countChar32(object->text + object->u.cp.offset, k - object->u.cp.offset); } else { object->u.cp.cp_offset = offset; } object->u.cp.offset = k; /* * Every codepoint is a boundary. 
*/ return TRUE; } static text_iter_ops text_iter_cp_ops = { text_iter_cp_valid, text_iter_cp_current, text_iter_cp_key, text_iter_cp_offset, text_iter_cp_next, text_iter_cp_rewind, text_iter_cp_following, text_iter_cp_isBoundary, }; /* Combining sequence ops */ static void text_iter_helper_move(zend_bool forward, UChar *text, int32_t text_len, int32_t *offset, int32_t *cp_offset) { UChar32 cp; int32_t tmp, tmp2; if (*offset == UBRK_DONE) { return; } if (forward) { if (*offset == text_len) { *offset = UBRK_DONE; *cp_offset = UBRK_DONE; } else { U16_NEXT(text, (*offset), text_len, cp); (*cp_offset)++; if (u_getCombiningClass(cp) == 0) { tmp = *offset; tmp2 = *cp_offset; /* * At the end of the string cp will be 0 because of the NULL * terminating NULL, so combining class will be 0 as well. */ while (tmp < text_len) { U16_NEXT(text, tmp, text_len, cp); tmp2++; if (u_getCombiningClass(cp) == 0) { break; } else { *offset = tmp; *cp_offset = tmp2; } } } } } else { if (*offset == 0) { *offset = UBRK_DONE; *cp_offset = UBRK_DONE; } else { U16_PREV(text, 0, (*offset), cp); (*cp_offset)--; if (u_getCombiningClass(cp) != 0) { do { U16_PREV(text, 0, (*offset), cp); (*cp_offset)--; } while (*offset > 0 && u_getCombiningClass(cp) != 0); } } } } static int text_iter_cs_valid(text_iter_obj* object, long flags TSRMLS_DC) { if (object->u.cs.start == UBRK_DONE) { return 0; } if (flags & ITER_REVERSE) { return (object->u.cs.start != 0); } else { return (object->u.cs.start != object->text_len); } } static void text_iter_cs_current(text_iter_obj* object, long flags TSRMLS_DC) { UChar *start; int32_t length = -1; if (object->u.cs.start != UBRK_DONE) { if (flags & ITER_REVERSE) { if (object->u.cs.end == object->u.cs.start) { text_iter_helper_move(0, object->text, object->text_len, &object->u.cs.end, &object->u.cs.end_cp_offset); } start = object->text + object->u.cs.end; } else { if (object->u.cs.end == object->u.cs.start) { text_iter_helper_move(1, object->text, object->text_len, 
&object->u.cs.end, &object->u.cs.end_cp_offset); } start = object->text + object->u.cs.start; } if (object->u.cs.end == UBRK_DONE) { length = 0; } else { length = abs(object->u.cs.end - object->u.cs.start); } } else { length = 0; } if (length != 0) { if (length+1 > object->current_alloc) { object->current_alloc = length+1; Z_USTRVAL_P(object->current) = eurealloc(Z_USTRVAL_P(object->current), object->current_alloc); } u_memcpy(Z_USTRVAL_P(object->current), start, length); } Z_USTRVAL_P(object->current)[length] = 0; Z_USTRLEN_P(object->current) = length; } static int text_iter_cs_key(text_iter_obj* object, long flags TSRMLS_DC) { return object->u.cs.index; } static int text_iter_cs_offset(text_iter_obj* object, long flags, int32_t *cp_offset TSRMLS_DC) { if (cp_offset) { *cp_offset = object->u.cs.start_cp_offset; } return object->u.cs.start; } static void text_iter_cs_next(text_iter_obj* object, long flags TSRMLS_DC) { if (object->u.cs.start == UBRK_DONE) { return; } if (flags & ITER_REVERSE) { text_iter_helper_move(0, object->text, object->text_len, &object->u.cs.start, &object->u.cs.start_cp_offset); } else { text_iter_helper_move(1, object->text, object->text_len, &object->u.cs.start, &object->u.cs.start_cp_offset); } object->u.cs.end = object->u.cs.start; object->u.cs.end_cp_offset = object->u.cs.start_cp_offset; object->u.cs.index++; } static void text_iter_cs_rewind(text_iter_obj *object, long flags TSRMLS_DC) { if (flags & ITER_REVERSE) { object->u.cs.start = object->u.cs.end = object->text_len; object->u.cs.start_cp_offset = object->u.cs.end_cp_offset = u_countChar32(object->text, object->text_len); } else { object->u.cs.start = object->u.cs.end = 0; object->u.cs.start_cp_offset = object->u.cs.end_cp_offset = 0; } object->u.cs.index = 0; /* because _next increments index */ } static void text_iter_cs_following(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC) { int32_t k; if (offset < 0) { offset = 0; } /* * On invalid iterator we always want to 
start looking for the code unit * offset from the beginning of the string. */ if (object->u.cs.start_cp_offset == UBRK_DONE) { object->u.cs.start_cp_offset = 0; object->u.cs.start = 0; } /* * Try to locate the code unit position relative to the last known codepoint * offset. */ k = object->u.cs.start; if (offset > object->u.cs.start_cp_offset) { U16_FWD_N(object->text, k, object->text_len, offset - object->u.cs.start_cp_offset); } else { U16_BACK_N(object->text, 0, k, object->u.cs.start_cp_offset - offset); } /* * Locate the actual boundary. */ if (flags & ITER_REVERSE) { /* * If offset was at or beyond the length of text, we need to adjust it * to the number of codepoints in the text. */ if (k == object->text_len) { offset = u_countChar32(object->text, object->text_len); } text_iter_helper_move(0, object->text, object->text_len, &k, &offset); } else { text_iter_helper_move(1, object->text, object->text_len, &k, &offset); } if (k == object->u.cs.start) { return; } object->u.cs.start = k; object->u.cs.start_cp_offset = offset; object->u.cs.end = object->u.cs.start; } static zend_bool text_iter_cs_isBoundary(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC) { UChar32 cp; int32_t k, tmp; zend_bool result; if (offset < 0) { offset = 0; } /* * On invalid iterator we always want to start looking for the code unit * offset from the beginning of the string. */ if (object->u.cs.start_cp_offset == UBRK_DONE) { object->u.cs.start_cp_offset = 0; object->u.cs.start = 0; } /* * Try to locate the code unit position relative to the last known codepoint * offset. 
*/ k = object->u.cs.start; if (offset > object->u.cs.start_cp_offset) { U16_FWD_N(object->text, k, object->text_len, offset - object->u.cs.start_cp_offset); } else { U16_BACK_N(object->text, 0, k, object->u.cs.start_cp_offset - offset); } /* end of the text is always a boundary */ if (k == object->text_len) { offset = u_countChar32(object->text, object->text_len); result = 1; } else { /* if the next codepoint is a base character, it's a boundary */ tmp = k; U16_NEXT(object->text, tmp, object->text_len, cp); result = (u_getCombiningClass(cp) == 0); } if (k == object->u.cs.start) { return result; } object->u.cs.start = k; object->u.cs.start_cp_offset = offset; object->u.cs.end = object->u.cs.start; return result; } static text_iter_ops text_iter_cs_ops = { text_iter_cs_valid, text_iter_cs_current, text_iter_cs_key, text_iter_cs_offset, text_iter_cs_next, text_iter_cs_rewind, text_iter_cs_following, text_iter_cs_isBoundary, }; /* UBreakIterator Ops */ static int text_iter_brk_valid(text_iter_obj* object, long flags TSRMLS_DC) { if (object->u.brk.bound == UBRK_DONE) { return 0; } if (flags & ITER_REVERSE) { return (object->u.brk.bound != 0); } else { return (object->u.brk.bound != object->text_len); } } static void text_iter_brk_current(text_iter_obj* object, long flags TSRMLS_DC) { UChar *start; int32_t length = -1; if (object->u.brk.bound != UBRK_DONE) { if (flags & ITER_REVERSE) { if (object->u.brk.next == object->u.brk.bound) { object->u.brk.next = ubrk_preceding(object->u.brk.n_iter, object->u.brk.bound); } start = object->text + object->u.brk.next; } else { if (object->u.brk.next == object->u.brk.bound) { object->u.brk.next = ubrk_following(object->u.brk.n_iter, object->u.brk.bound); } start = object->text + object->u.brk.bound; } if (object->u.brk.next == UBRK_DONE) { length = 0; } else { length = abs(object->u.brk.next - object->u.brk.bound); } } else { length = 0; } if (length != 0) { if (length+1 > object->current_alloc) { object->current_alloc = length+1; 
Z_USTRVAL_P(object->current) = eurealloc(Z_USTRVAL_P(object->current), object->current_alloc); } u_memcpy(Z_USTRVAL_P(object->current), start, length); } Z_USTRVAL_P(object->current)[length] = 0; Z_USTRLEN_P(object->current) = length; } static int text_iter_brk_key(text_iter_obj* object, long flags TSRMLS_DC) { return object->u.brk.index; } static int text_iter_brk_offset(text_iter_obj* object, long flags, int32_t *cp_offset TSRMLS_DC) { if (cp_offset) { *cp_offset = object->u.brk.cp_offset; } return object->u.brk.bound; } static void text_iter_brk_next(text_iter_obj* object, long flags TSRMLS_DC) { int32_t tmp = object->u.brk.bound; if (object->u.brk.bound == UBRK_DONE) { return; } if (flags & ITER_REVERSE) { object->u.brk.bound = ubrk_previous(object->u.brk.iter); object->u.brk.next = object->u.brk.bound; if (object->u.brk.bound != UBRK_DONE) { if (tmp - object->u.brk.bound > 1) { object->u.brk.cp_offset -= u_countChar32(object->text + object->u.brk.bound, tmp - object->u.brk.bound); } else { object->u.brk.cp_offset--; } } else { object->u.brk.cp_offset = UBRK_DONE; } } else { object->u.brk.bound = ubrk_next(object->u.brk.iter); object->u.brk.next = object->u.brk.bound; if (object->u.brk.bound != UBRK_DONE) { if (object->u.brk.bound - tmp > 1) { object->u.brk.cp_offset += u_countChar32(object->text + tmp, object->u.brk.bound - tmp); } else { object->u.brk.cp_offset++; } } else { object->u.brk.cp_offset = UBRK_DONE; } } object->u.brk.index++; } static void text_iter_brk_rewind(text_iter_obj *object, long flags TSRMLS_DC) { if (flags & ITER_REVERSE) { object->u.brk.bound = ubrk_last(object->u.brk.iter); object->u.brk.next = ubrk_last(object->u.brk.n_iter); object->u.brk.cp_offset = u_countChar32(object->text, object->u.brk.bound); } else { object->u.brk.bound = ubrk_first(object->u.brk.iter); object->u.brk.next = ubrk_first(object->u.brk.n_iter); object->u.brk.cp_offset = 0; } object->u.brk.index = 0; } static void text_iter_brk_following(text_iter_obj *object, 
int32_t offset, long flags TSRMLS_DC) { int32_t k, tmp; if (offset < 0) { offset = 0; } /* * On invalid iterator we always want to start looking for the code unit * offset from the beginning of the string. */ if (object->u.brk.cp_offset == UBRK_DONE) { object->u.brk.cp_offset = 0; object->u.brk.bound = 0; } /* * Try to locate the code unit position relative to the last known codepoint * offset. */ k = tmp = object->u.brk.bound; if (offset > object->u.brk.cp_offset) { U16_FWD_N(object->text, k, object->text_len, offset - object->u.brk.cp_offset); } else { U16_BACK_N(object->text, 0, k, object->u.brk.cp_offset - offset); } /* * Locate the actual boundary. */ if (flags & ITER_REVERSE) { object->u.brk.bound = ubrk_preceding(object->u.brk.iter, k); } else { object->u.brk.bound = ubrk_following(object->u.brk.iter, k); } /* * If boundary is the same one as where we were at before, simply return. */ if (object->u.brk.bound == tmp) { return; } object->u.brk.next = object->u.brk.bound; /* * Adjust the internal codepoint offset based on how far we've moved. */ if (object->u.brk.bound != UBRK_DONE) { if (object->u.brk.bound > tmp) { if (object->u.brk.bound - tmp > 1) { object->u.brk.cp_offset += u_countChar32(object->text + tmp, object->u.brk.bound - tmp); } else { object->u.brk.cp_offset++; } } else { if (tmp - object->u.brk.bound > 1) { object->u.brk.cp_offset -= u_countChar32(object->text + object->u.brk.bound, tmp - object->u.brk.bound); } else { object->u.brk.cp_offset--; } } } else { object->u.brk.cp_offset = UBRK_DONE; } } static zend_bool text_iter_brk_isBoundary(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC) { int32_t k, tmp; UBool result; if (offset < 0) { offset = 0; } /* * On invalid iterator we always want to start looking for the code unit * offset from the beginning of the string. 
*/ if (object->u.brk.cp_offset == UBRK_DONE) { object->u.brk.cp_offset = 0; object->u.brk.bound = 0; } /* * Try to locate the code unit position relative to the last known codepoint * offset. */ k = tmp = object->u.brk.bound; if (offset > object->u.brk.cp_offset) { U16_FWD_N(object->text, k, object->text_len, offset - object->u.brk.cp_offset); } else { U16_BACK_N(object->text, 0, k, object->u.brk.cp_offset - offset); } result = ubrk_isBoundary(object->u.brk.iter, k); object->u.brk.bound = ubrk_current(object->u.brk.iter); object->u.brk.next = object->u.brk.bound; /* * If boundary is the same one as where we were at before, simply return. */ if (object->u.brk.bound == tmp) { return result; } /* * Adjust the internal codepoint offset based on how far we've moved. */ if (object->u.brk.bound != UBRK_DONE) { if (object->u.brk.bound > tmp) { if (object->u.brk.bound - tmp > 1) { object->u.brk.cp_offset += u_countChar32(object->text + tmp, object->u.brk.bound - tmp); } else { object->u.brk.cp_offset++; } } else { if (tmp - object->u.brk.bound > 1) { object->u.brk.cp_offset -= u_countChar32(object->text + object->u.brk.bound, tmp - object->u.brk.bound); } else { object->u.brk.cp_offset--; } } } else { object->u.brk.cp_offset = UBRK_DONE; } return result; } static text_iter_ops text_iter_brk_ops = { text_iter_brk_valid, text_iter_brk_current, text_iter_brk_key, text_iter_brk_offset, text_iter_brk_next, text_iter_brk_rewind, text_iter_brk_following, text_iter_brk_isBoundary, }; /* Ops array */ static text_iter_ops* iter_ops[] = { &text_iter_cp_ops, &text_iter_cs_ops, &text_iter_brk_ops, &text_iter_brk_ops, &text_iter_brk_ops, &text_iter_brk_ops, }; /* Iterator Funcs */ static void text_iter_dtor(zend_object_iterator* iter TSRMLS_DC) { text_iter_obj* obj = text_iter_to_obj(iter); zval *object = obj->iter.data; zval_ptr_dtor(&object); } static int text_iter_valid(zend_object_iterator* iter TSRMLS_DC) { text_iter_obj* obj = text_iter_to_obj(iter); if 
(iter_ops[obj->type]->valid(obj, obj->flags TSRMLS_CC)) { return SUCCESS; } else { return FAILURE; } } static void text_iter_get_current_data(zend_object_iterator* iter, zval*** data TSRMLS_DC) { text_iter_obj* obj = text_iter_to_obj(iter); iter_ops[obj->type]->current(obj, obj->flags TSRMLS_CC); *data = &obj->current; } static int text_iter_get_current_key(zend_object_iterator* iter, zstr *str_key, uint *str_key_len, ulong *int_key TSRMLS_DC) { text_iter_obj* obj = text_iter_to_obj(iter); *int_key = iter_ops[obj->type]->key(obj, obj->flags TSRMLS_CC); return HASH_KEY_IS_LONG; } static void text_iter_move_forward(zend_object_iterator* iter TSRMLS_DC) { text_iter_obj* obj = text_iter_to_obj(iter); iter_ops[obj->type]->next(obj, obj->flags TSRMLS_CC); } static void text_iter_rewind(zend_object_iterator* iter TSRMLS_DC) { text_iter_obj* obj = text_iter_to_obj(iter); iter_ops[obj->type]->rewind(obj, obj->flags TSRMLS_CC); } zend_object_iterator_funcs text_iter_funcs = { text_iter_dtor, text_iter_valid, text_iter_get_current_data, text_iter_get_current_key, text_iter_move_forward, text_iter_rewind, }; static zend_object_iterator* text_iter_get_iterator(zend_class_entry *ce, zval *object, int by_ref TSRMLS_DC) { text_iter_obj* iter_object; if (by_ref) { zend_error(E_ERROR, "An iterator cannot be used with foreach by reference"); } iter_object = (text_iter_obj *) zend_object_store_get_object(object TSRMLS_CC); ZVAL_ADDREF(object); iter_object->iter.data = (void *) object; iter_object->iter.funcs = &text_iter_funcs; return (zend_object_iterator *) &iter_object->iter; } static void text_iterator_free_storage(void *object TSRMLS_DC) { text_iter_obj *intern = (text_iter_obj *) object; zend_hash_destroy(intern->std.properties); FREE_HASHTABLE(intern->std.properties); if (intern->text) { efree(intern->text); } if (intern->type > ITER_CHARACTER) { if (intern->u.brk.iter) { ubrk_close(intern->u.brk.iter); } if (intern->u.brk.n_iter) { ubrk_close(intern->u.brk.n_iter); } } 
zval_ptr_dtor(&intern->current); efree(object); } static zend_object_value text_iterator_new(zend_class_entry *class_type TSRMLS_DC) { zend_object_value retval; text_iter_obj *intern; zval *tmp; intern = emalloc(sizeof(text_iter_obj)); memset(intern, 0, sizeof(text_iter_obj)); intern->std.ce = class_type; ALLOC_HASHTABLE(intern->std.properties); zend_hash_init(intern->std.properties, 0, NULL, ZVAL_PTR_DTOR, 0); zend_hash_copy(intern->std.properties, &class_type->default_properties, (copy_ctor_func_t) zval_add_ref, (void *) &tmp, sizeof(zval *)); intern->type = ITER_CODE_POINT; MAKE_STD_ZVAL(intern->current); /* pre-allocate buffer for codepoint */ intern->current_alloc = 3; Z_USTRVAL_P(intern->current) = eumalloc(3); Z_USTRVAL_P(intern->current)[0] = 0; Z_USTRLEN_P(intern->current) = 0; Z_TYPE_P(intern->current) = IS_UNICODE; retval.handle = zend_objects_store_put(intern, (zend_objects_store_dtor_t)zend_objects_destroy_object, (zend_objects_free_object_storage_t) text_iterator_free_storage, NULL TSRMLS_CC); retval.handlers = zend_get_std_object_handlers(); return retval; } /* {{{ proto void TextIterator::__construct(unicode text [, int flags = TextIterator::CODEPOINT [, string locale ]]) U TextIterator constructor */ PHP_METHOD(TextIterator, __construct) { UChar *text; int32_t text_len; zval *object = getThis(); text_iter_obj *intern; text_iter_type ti_type; char *locale = NULL; int locale_len; long flags = 0; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "u|ls", &text, &text_len, &flags, &locale, &locale_len) == FAILURE) { return; } intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); intern->text = eustrndup(text, text_len); intern->text_len = text_len; if (ZEND_NUM_ARGS() > 1) { ti_type = flags & ITER_TYPE_MASK; if (ti_type < 0 || ti_type >= ITER_TYPE_LAST) { php_error(E_WARNING, "Invalid iterator type in TextIterator constructor"); ti_type = ITER_CODE_POINT; } intern->type = ti_type; intern->flags = flags; } if (Z_OBJCE_P(this_ptr) 
== rev_text_iterator_ce) { intern->flags |= ITER_REVERSE; } if (intern->type >= ITER_CHARACTER && intern->type < ITER_TYPE_LAST) { UErrorCode status = U_ZERO_ERROR; UErrorCode status2 = U_ZERO_ERROR; locale = locale ? locale : UG(default_locale); intern->u.brk.iter = ubrk_open(brk_type_map[intern->type - ITER_CHARACTER], locale, intern->text, intern->text_len, &status); intern->u.brk.n_iter = ubrk_open(brk_type_map[intern->type - ITER_CHARACTER], locale, intern->text, intern->text_len, &status); if (!U_SUCCESS(status) || !U_SUCCESS(status2)) { php_error(E_RECOVERABLE_ERROR, "Could not create UBreakIterator for '%s' locale: %s", locale, u_errorName(status)); return; } } iter_ops[intern->type]->rewind(intern, intern->flags TSRMLS_CC); } /* }}} */ /* {{{ proto unicode TextIterator::current() U Returns the element at the current boundary */ PHP_METHOD(TextIterator, current) { zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } iter_ops[intern->type]->current(intern, intern->flags TSRMLS_CC); RETURN_UNICODEL(Z_USTRVAL_P(intern->current), Z_USTRLEN_P(intern->current), 1); } /* }}} */ /* {{{ proto int TextIterator::next([int n]) U Advances to the n'th text boundary following the current one and returns its offset */ PHP_METHOD(TextIterator, next) { long i, step = 1; int32_t cp_offset; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|l", &step) == FAILURE) { return; } if (step <= 0) { step = 1; } for (i = 0; i < step; i++) { iter_ops[intern->type]->next(intern, intern->flags TSRMLS_CC); } if (return_value_used) { iter_ops[intern->type]->offset(intern, intern->flags, &cp_offset TSRMLS_CC); RETURN_LONG(cp_offset); } } /* }}} */ /* {{{ proto int TextIterator::key() U Returns the number boundaries iterated through */ 
PHP_METHOD(TextIterator, key) { zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } RETURN_LONG(iter_ops[intern->type]->key(intern, intern->flags TSRMLS_CC)); } /* }}} */ /* {{{ proto bool TextIterator::valid() U Determines validity of the iterator */ PHP_METHOD(TextIterator, valid) { zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } RETURN_BOOL(iter_ops[intern->type]->valid(intern, intern->flags TSRMLS_CC)); } /* }}} */ /* {{{ proto int TextIterator::first() U Positions iterator at the first character in the text and returns the offset */ PHP_METHOD(TextIterator, rewind) { int32_t cp_offset; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } iter_ops[intern->type]->rewind(intern, intern->flags TSRMLS_CC); if (return_value_used) { iter_ops[intern->type]->offset(intern, intern->flags, &cp_offset TSRMLS_CC); RETURN_LONG(cp_offset); } } /* }}} */ /* {{{ proto int TextIterator::last() U Positions iterator beyond the last character in the text and returns the offset */ PHP_METHOD(TextIterator, last) { long flags; int32_t cp_offset; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } flags = intern->flags ^ ITER_REVERSE; iter_ops[intern->type]->rewind(intern, flags TSRMLS_CC); if (return_value_used) { iter_ops[intern->type]->offset(intern, flags, &cp_offset TSRMLS_CC); RETURN_LONG(cp_offset); } } /* }}} */ /* {{{ proto int TextIterator::offset() U Returns the offset of the current text boundary */ 
PHP_METHOD(TextIterator, offset) { int32_t cp_offset; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } iter_ops[intern->type]->offset(intern, intern->flags, &cp_offset TSRMLS_CC); RETURN_LONG(cp_offset); } /* }}} */ /* {{{ proto int TextIterator::previous([int n]) U Advances to the n'th text boundary preceding the current one and returns its offset */ PHP_METHOD(TextIterator, previous) { long flags, i, step = 1; int32_t cp_offset; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|l", &step) == FAILURE) { return; } if (step <= 0) { step = 1; } flags = intern->flags ^ ITER_REVERSE; for (i = 0; i < step; i++) { iter_ops[intern->type]->next(intern, flags TSRMLS_CC); } if (return_value_used) { iter_ops[intern->type]->offset(intern, flags, &cp_offset TSRMLS_CC); RETURN_LONG(cp_offset); } } /* }}} */ /* {{{ proto int TextIterator::following(int offset) U Advances to the text boundary following the specified offset and returns its offset */ PHP_METHOD(TextIterator, following) { long offset; int32_t cp_offset; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &offset) == FAILURE) { return; } iter_ops[intern->type]->following(intern, offset, intern->flags TSRMLS_CC); iter_ops[intern->type]->offset(intern, intern->flags, &cp_offset TSRMLS_CC); RETURN_LONG(cp_offset); } /* }}} */ /* {{{ proto int TextIterator::preceding(int offset) U Advances to the text boundary preceding the specified offset and returns its offset */ PHP_METHOD(TextIterator, preceding) { long flags, offset; int32_t cp_offset; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) 
zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &offset) == FAILURE) { return; } /* * ReverseTextIterator will behave in the same way as the normal one. */ flags = intern->flags | ITER_REVERSE; iter_ops[intern->type]->following(intern, offset, flags TSRMLS_CC); iter_ops[intern->type]->offset(intern, flags, &cp_offset TSRMLS_CC); RETURN_LONG(cp_offset); } /* }}} */ /* {{{ proto bool TextIterator::isBoundary(int offset) U Determines whether specified offset is a text boundary */ PHP_METHOD(TextIterator, isBoundary) { long offset; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &offset) == FAILURE) { return; } /* * ReverseTextIterator will behave in the same way as the normal one. */ RETURN_BOOL(iter_ops[intern->type]->isBoundary(intern, offset, intern->flags TSRMLS_CC)); } /* }}} */ /* {{{ proto array TextIterator::getAvailableLocales() U Returns locales for which text boundary information is available */ PHP_METHOD(TextIterator, getAvailableLocales) { int32_t count, i; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } array_init(return_value); count = ubrk_countAvailable(); for (i = 0; i < count; i++) { add_next_index_ascii_string(return_value, (char*)ubrk_getAvailable(i), ZSTR_DUPLICATE); } } /* }}} */ /* {{{ proto int TextIterator::getRuleStatus() U Return the status from the break rule that determined the most recent boundary */ PHP_METHOD(TextIterator, getRuleStatus) { zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } if (intern->type >= ITER_CHARACTER && intern->type < ITER_TYPE_LAST) { RETURN_LONG(ubrk_getRuleStatus(intern->u.brk.iter)); } else { RETURN_LONG(0); } } /* }}} */ /* {{{ proto array 
TextIterator::getRuleStatusArray() U Return the statuses from the break rules that determined the most recent boundary */ PHP_METHOD(TextIterator, getRuleStatusArray) { int32_t status_vec[32], *vec_ptr = status_vec; int32_t vec_size, i; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); UErrorCode status = U_ZERO_ERROR; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } array_init(return_value); if (intern->type < ITER_CHARACTER) { add_next_index_long(return_value, 0); } else { vec_size = sizeof(status_vec) / sizeof(status_vec[0]); vec_size = ubrk_getRuleStatusVec(intern->u.brk.iter, vec_ptr, vec_size, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { vec_ptr = safe_emalloc(vec_size, sizeof(int32_t), 0); status = U_ZERO_ERROR; vec_size = ubrk_getRuleStatusVec(intern->u.brk.iter, vec_ptr, vec_size, &status); } for (i = 0; i < vec_size; i++) { add_next_index_long(return_value, vec_ptr[i]); } if (vec_ptr != status_vec) { efree(vec_ptr); } } } /* }}} */ /* {{{ proto array TextIterator::getAll() U Return all text pieces determined by the text boundaries */ PHP_METHOD(TextIterator, getAll) { int32_t start, end; zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); text_iter_ops *ops = iter_ops[intern->type]; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) { return; } array_init(return_value); ops->rewind(intern, intern->flags TSRMLS_CC); start = ops->offset(intern, intern->flags, NULL TSRMLS_CC); for (ops->next(intern, intern->flags TSRMLS_CC), end = ops->offset(intern, intern->flags, NULL TSRMLS_CC); end != UBRK_DONE; start = end, ops->next(intern, intern->flags TSRMLS_CC), end = ops->offset(intern, intern->flags, NULL TSRMLS_CC)) { if (end > start) { add_next_index_unicodel(return_value, intern->text + start, end - start, 1); } else { add_next_index_unicodel(return_value, intern->text + end, 
start - end, 1); } } } /* }}} */ /* {{{ TextIterator function entry table */ static zend_function_entry text_iterator_funcs[] = { PHP_ME(TextIterator, __construct, NULL, ZEND_ACC_PUBLIC) /* Iterator interface methods */ PHP_ME(TextIterator, current, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, next, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, key, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, valid, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, rewind, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, offset, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, previous, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, last, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, following, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, preceding, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, isBoundary, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, getAll, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, getRuleStatus, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, getRuleStatusArray, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, getAvailableLocales, NULL, ZEND_ACC_PUBLIC | ZEND_ACC_STATIC) PHP_MALIAS(TextIterator, first, rewind, NULL, ZEND_ACC_PUBLIC) {NULL, NULL, NULL} }; /* }}} */ void php_register_unicode_iterators(TSRMLS_D) { zend_class_entry ce; INIT_CLASS_ENTRY(ce, "TextIterator", text_iterator_funcs); text_iterator_ce = zend_register_internal_class(&ce TSRMLS_CC); text_iterator_ce->create_object = text_iterator_new; text_iterator_ce->get_iterator = text_iter_get_iterator; text_iterator_ce->ce_flags |= ZEND_ACC_FINAL_CLASS; zend_class_implements(text_iterator_ce TSRMLS_CC, 1, zend_ce_iterator); INIT_CLASS_ENTRY(ce, "ReverseTextIterator", text_iterator_funcs); rev_text_iterator_ce = zend_register_internal_class(&ce TSRMLS_CC); rev_text_iterator_ce->create_object = text_iterator_new; rev_text_iterator_ce->get_iterator = text_iter_get_iterator; rev_text_iterator_ce->ce_flags |= ZEND_ACC_FINAL_CLASS; zend_class_implements(rev_text_iterator_ce TSRMLS_CC, 1, zend_ce_iterator); 
zend_declare_class_constant_long(text_iterator_ce, "CODE_POINT", sizeof("CODE_POINT")-1, ITER_CODE_POINT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "COMB_SEQUENCE", sizeof("COMB_SEQUENCE")-1, ITER_COMB_SEQUENCE TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "CHARACTER", sizeof("CHARACTER")-1, ITER_CHARACTER TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD", sizeof("WORD")-1, ITER_WORD TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "LINE", sizeof("LINE")-1, ITER_LINE TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "SENTENCE", sizeof("SENTENCE")-1, ITER_SENTENCE TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "DONE", sizeof("DONE")-1, UBRK_DONE TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_NONE", sizeof("WORD_NONE")-1, UBRK_WORD_NONE TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_NONE_LIMIT", sizeof("WORD_NONE_LIMIT")-1, UBRK_WORD_NONE_LIMIT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_NUMBER", sizeof("WORD_NUMBER")-1, UBRK_WORD_NUMBER TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_NUMBER_LIMIT", sizeof("WORD_NUMBER_LIMIT")-1, UBRK_WORD_NUMBER_LIMIT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_LETTER", sizeof("WORD_LETTER")-1, UBRK_WORD_LETTER TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_LETTER_LIMIT", sizeof("WORD_LETTER_LIMIT")-1, UBRK_WORD_LETTER_LIMIT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_KANA", sizeof("WORD_KANA")-1, UBRK_WORD_KANA TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_KANA_LIMIT", sizeof("WORD_KANA_LIMIT")-1, UBRK_WORD_KANA_LIMIT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_IDEO", sizeof("WORD_IDEO")-1, UBRK_WORD_IDEO TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "WORD_IDEO_LIMIT", sizeof("WORD_IDEO_LIMIT")-1, 
UBRK_WORD_IDEO_LIMIT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "LINE_SOFT", sizeof("LINE_SOFT")-1, UBRK_LINE_SOFT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "LINE_SOFT_LIMIT", sizeof("LINE_SOFT_LIMIT")-1, UBRK_LINE_SOFT_LIMIT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "LINE_HARD", sizeof("LINE_HARD")-1, UBRK_LINE_HARD TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "LINE_HARD_LIMIT", sizeof("LINE_HARD_LIMIT")-1, UBRK_LINE_HARD_LIMIT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "SENTENCE_TERM", sizeof("SENTENCE_TERM")-1, UBRK_SENTENCE_TERM TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "SENTENCE_TERM_LIMIT", sizeof("SENTENCE_TERM_LIMIT")-1, UBRK_SENTENCE_TERM_LIMIT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "SENTENCE_SEP", sizeof("SENTENCE_SEP")-1, UBRK_SENTENCE_SEP TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "SENTENCE_SEP_LIMIT", sizeof("SENTENCE_SEP_LIMIT")-1, UBRK_SENTENCE_SEP_LIMIT TSRMLS_CC); } /* * Local variables: * tab-width: 4 * c-basic-offset: 4 * End: * vim600: sw=4 ts=4 fdm=marker * vim<600: sw=4 ts=4 */