mirror of
https://github.com/php/php-src.git
synced 2024-09-21 18:07:23 +00:00
Optimizations for HTML 5 loading (#12896)
* Fix inverted NULL and add dictionary * Avoid useless error processing if no reporting is set * Avoid double work while processing attributes and use fast text instantiation
This commit is contained in:
parent
90eb5679d2
commit
a9064816db
@ -74,6 +74,26 @@ static const xmlChar *get_libxml_namespace_href(uintptr_t lexbor_namespace)
|
||||
}
|
||||
}
|
||||
|
||||
static xmlNodePtr lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc, const lxb_char_t *data, size_t data_length, bool compact_text_nodes)
|
||||
{
|
||||
if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
|
||||
/* See xmlSAX2TextNode() in libxml2 */
|
||||
xmlNodePtr lxml_text = xmlMalloc(sizeof(*lxml_text));
|
||||
if (UNEXPECTED(lxml_text == NULL)) {
|
||||
return NULL;
|
||||
}
|
||||
memset(lxml_text, 0, sizeof(*lxml_text));
|
||||
lxml_text->name = xmlStringText;
|
||||
lxml_text->type = XML_TEXT_NODE;
|
||||
lxml_text->doc = lxml_doc;
|
||||
lxml_text->content = (xmlChar *) &lxml_text->properties;
|
||||
memcpy(lxml_text->content, data, data_length);
|
||||
return lxml_text;
|
||||
} else {
|
||||
return xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
|
||||
}
|
||||
}
|
||||
|
||||
static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
|
||||
lxb_dom_node_t *start_node,
|
||||
xmlDocPtr lxml_doc,
|
||||
@ -130,14 +150,52 @@ static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
|
||||
);
|
||||
}
|
||||
|
||||
for (lxb_dom_attr_t *attr = element->last_attr; attr != NULL; attr = attr->prev) {
|
||||
lexbor_libxml2_bridge_work_list_item_push(
|
||||
&work_list,
|
||||
(lxb_dom_node_t *) attr,
|
||||
entering_namespace,
|
||||
lxml_element,
|
||||
current_lxml_ns
|
||||
);
|
||||
xmlAttrPtr last_added_attr = NULL;
|
||||
for (lxb_dom_attr_t *attr = element->first_attr; attr != NULL; attr = attr->next) {
|
||||
/* Same namespace remark as for elements */
|
||||
size_t local_name_length, value_length;
|
||||
const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, &local_name_length);
|
||||
const lxb_char_t *value = lxb_dom_attr_value(attr, &value_length);
|
||||
|
||||
if (UNEXPECTED(local_name_length >= INT_MAX || value_length >= INT_MAX)) {
|
||||
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
|
||||
goto out;
|
||||
}
|
||||
|
||||
xmlAttrPtr lxml_attr = xmlMalloc(sizeof(xmlAttr));
|
||||
if (UNEXPECTED(lxml_attr == NULL)) {
|
||||
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
memset(lxml_attr, 0, sizeof(xmlAttr));
|
||||
lxml_attr->type = XML_ATTRIBUTE_NODE;
|
||||
lxml_attr->parent = lxml_element;
|
||||
lxml_attr->name = xmlDictLookup(lxml_doc->dict, local_name, local_name_length);
|
||||
lxml_attr->doc = lxml_doc;
|
||||
xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, value, value_length, true /* Always true for optimization purposes */);
|
||||
if (UNEXPECTED(lxml_text == NULL)) {
|
||||
xmlFreeProp(lxml_attr);
|
||||
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
lxml_attr->children = lxml_attr->last = lxml_text;
|
||||
|
||||
if (last_added_attr == NULL) {
|
||||
lxml_element->properties = lxml_attr;
|
||||
} else {
|
||||
last_added_attr->next = lxml_attr;
|
||||
lxml_attr->prev = last_added_attr;
|
||||
}
|
||||
last_added_attr = lxml_attr;
|
||||
|
||||
/* xmlIsID does some other stuff too that is irrelevant here. */
|
||||
if (local_name_length == 2 && local_name[0] == 'i' && local_name[1] == 'd') {
|
||||
xmlAddID(NULL, lxml_doc, value, lxml_attr);
|
||||
}
|
||||
|
||||
/* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
|
||||
}
|
||||
} else if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
|
||||
lxb_dom_text_t *text = lxb_dom_interface_text(node);
|
||||
@ -147,26 +205,10 @@ static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
|
||||
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
|
||||
goto out;
|
||||
}
|
||||
xmlNodePtr lxml_text;
|
||||
if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
|
||||
/* See xmlSAX2TextNode() in libxml2 */
|
||||
lxml_text = xmlMalloc(sizeof(*lxml_text));
|
||||
if (UNEXPECTED(lxml_text == NULL)) {
|
||||
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
|
||||
goto out;
|
||||
}
|
||||
memset(lxml_text, 0, sizeof(*lxml_text));
|
||||
lxml_text->name = xmlStringText;
|
||||
lxml_text->type = XML_TEXT_NODE;
|
||||
lxml_text->doc = lxml_doc;
|
||||
lxml_text->content = (xmlChar *) &lxml_text->properties;
|
||||
memcpy(lxml_text->content, data, data_length + 1 /* include '\0' */);
|
||||
} else {
|
||||
lxml_text = xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
|
||||
if (UNEXPECTED(lxml_text == NULL)) {
|
||||
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
|
||||
goto out;
|
||||
}
|
||||
xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, data, data_length, compact_text_nodes);
|
||||
if (UNEXPECTED(lxml_text == NULL)) {
|
||||
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
|
||||
goto out;
|
||||
}
|
||||
xmlAddChild(lxml_parent, lxml_text);
|
||||
if (node->line >= USHRT_MAX) {
|
||||
@ -192,20 +234,6 @@ static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
|
||||
goto out;
|
||||
}
|
||||
/* libxml2 doesn't support line numbers on this anyway, it returns -1 instead, so don't bother */
|
||||
} else if (node->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
|
||||
lxb_dom_attr_t *attr = lxb_dom_interface_attr(node);
|
||||
do {
|
||||
/* Same namespace remark as for elements */
|
||||
const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, NULL);
|
||||
const lxb_char_t *value = lxb_dom_attr_value(attr, NULL);
|
||||
xmlAttrPtr lxml_attr = xmlSetNsProp(lxml_parent, NULL, local_name, value);
|
||||
if (UNEXPECTED(lxml_attr == NULL)) {
|
||||
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
|
||||
goto out;
|
||||
}
|
||||
attr = attr->next;
|
||||
/* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
|
||||
} while (attr);
|
||||
} else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) {
|
||||
lxb_dom_comment_t *comment = lxb_dom_interface_comment(node);
|
||||
xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->char_data.data.data);
|
||||
@ -247,15 +275,19 @@ lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(
|
||||
{
|
||||
#ifdef LIBXML_HTML_ENABLED
|
||||
xmlDocPtr lxml_doc = htmlNewDocNoDtD(NULL, NULL);
|
||||
if (UNEXPECTED(!lxml_doc)) {
|
||||
return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
|
||||
}
|
||||
#else
|
||||
/* If HTML support is not enabled, then htmlNewDocNoDtD() is not available.
|
||||
* This code mimics the behaviour. */
|
||||
xmlDocPtr lxml_doc = xmlNewDoc((const xmlChar *) "1.0");
|
||||
lxml_doc->type = XML_HTML_DOCUMENT_NODE;
|
||||
#endif
|
||||
if (!lxml_doc) {
|
||||
if (UNEXPECTED(!lxml_doc)) {
|
||||
return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
|
||||
}
|
||||
lxml_doc->type = XML_HTML_DOCUMENT_NODE;
|
||||
#endif
|
||||
lxml_doc->dict = xmlDictCreate();
|
||||
lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
|
||||
lxb_dom_interface_node(document)->last_child,
|
||||
lxml_doc,
|
||||
|
@ -487,8 +487,10 @@ static bool dom_process_parse_chunk(
|
||||
if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
|
||||
return false;
|
||||
}
|
||||
lexbor_libxml2_bridge_report_errors(ctx, parser, encoding_output, application_data->current_total_offset, tokenizer_error_offset, tree_error_offset);
|
||||
dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, application_data->current_total_offset + input_buffer_length);
|
||||
if (ctx->tokenizer_error_reporter || ctx->tree_error_reporter) {
|
||||
lexbor_libxml2_bridge_report_errors(ctx, parser, encoding_output, application_data->current_total_offset, tokenizer_error_offset, tree_error_offset);
|
||||
dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, application_data->current_total_offset + input_buffer_length);
|
||||
}
|
||||
application_data->current_total_offset += input_buffer_length;
|
||||
application_data->cache_tokenizer.last_offset = 0;
|
||||
return true;
|
||||
|
Loading…
Reference in New Issue
Block a user