diff --git a/ext/tidy/README b/ext/tidy/README index 08bb1a95365..2d4e015176f 100644 --- a/ext/tidy/README +++ b/ext/tidy/README @@ -1,7 +1,7 @@ README FOR ext/tidy by John Coggeshall -Tidy Version: 0.5b +Tidy Version: 0.7b Tidy is an extension based on Libtidy (http://tidy.sf.net/) and allows a PHP developer to clean, repair, and traverse HTML, XHTML, and XML documents -- including ones with @@ -19,55 +19,55 @@ then recompile libtidy. The Tidy extension has two separate APIs, one for general parsing, cleaning, and repairing and another for document traversal. The general API is provided below: - tidy_create() Initialize and return a tidy document resource - tidy_parse_file($tidy, $file) Parse the document stored in $file - tidy_parse_string($tidy, $str) Parse the string stored in $str + tidy_create() Reinitialize the tidy engine + tidy_parse_file($file) Parse the document stored in $file + tidy_parse_string($str) Parse the string stored in $str - tidy_clean_repair($tidy) Clean and repair the document - tidy_diagnose($tidy) Diagnose a parsed document + tidy_clean_repair() Clean and repair the document + tidy_diagnose() Diagnose a parsed document - tidy_setopt($tidy, $opt, $val) Set a configuration option $opt to $val - tidy_getopt($tidy, $opt) Retrieve a configuration option + tidy_setopt($opt, $val) Set a configuration option $opt to $val + tidy_getopt($opt) Retrieve a configuration option - ** note: $opt is a string representing the option. Right now the only - source of these options is the LibTidy source.. eventually I'll document - them offically -- see the src/config.c file in the tidy source ** + ** note: $opt is a string representing the option. Although no formal + documentation yet exists for PHP, you can find a description of many + of them at http://www.w3.org/People/Raggett/tidy/ and a list of supported + options in the phpinfo(); output** - tidy_get_output($tidy) Return the cleaned tidy HTML as a string - tidy_get_error_buffer($tidy) Return a log of the errors and warnings + tidy_get_output() Return the cleaned tidy HTML as a string + tidy_get_error_buffer() Return a log of the errors and warnings returned by tidy tidy_get_release() Return the Libtidy release date - tidy_get_status($tidy) Return the status of the document - tidy_get_html_ver($tidy) Return the major HTML version detected for + tidy_get_status() Return the status of the document + tidy_get_html_ver() Return the major HTML version detected for the document; - tidy_is_xhtml($tidy) Determines if the document is XHTML - tidy_is_xml($tidy) Determines if the document is a generic XML + tidy_is_xhtml() Determines if the document is XHTML + tidy_is_xml() Determines if the document is a generic XML - tidy_error_count($tidy) Returns the number of errors in the document - tidy_warning_count($tidy) Returns the number of warnings in the document - tidy_access_count($tidy) Returns the number of accessibility-related + tidy_error_count() Returns the number of errors in the document + tidy_warning_count() Returns the number of warnings in the document + tidy_access_count() Returns the number of accessibility-related warnings in the document. - tidy_config_count($tidy) Returns the number of configuration errors found + tidy_config_count() Returns the number of configuration errors found - tidy_load_config($tidy, $file) Loads the specified configuration file - tidY_load_config_enc($tidy, - $file, + tidy_load_config($file) Loads the specified configuration file + tidY_load_config_enc($file, $enc) Loads the specified config file using the specified character encoding - tidy_set_encoding($tidy, $enc) Sets the current character encoding for the document - tidy_save_config($tidy, $file) Saves the current config to $file + tidy_set_encoding($enc) Sets the current character encoding for the document + tidy_save_config($file) Saves the current config to $file Beyond these general-purpose API functions, Tidy also supports the following functions which are used to retrieve an object for document traversal: - tidy_get_root($tidy) Returns an object starting at the root of the + tidy_get_root() Returns an object starting at the root of the document - tidy_get_head($tidy) Returns an object starting at the tag - tidy_get_html($tidy) Returns an object starting at the tag - tidy_get_body($tidy) Returns an object starting at the tag + tidy_get_head() Returns an object starting at the tag + tidy_get_html() Returns an object starting at the tag + tidy_get_body() Returns an object starting at the tag All Navigation of the specified document is done via the PHP5 object constructs. There are two types of objects which Tidy can create. The first is TidyNode, which @@ -82,18 +82,12 @@ class TidyNode { public $type; // type of node (text, php, asp, etc.) public $id; // id of node (i.e. TIDY_TAG_HEAD) - public $line; // line # of node in source - public $column; // column # of node in source - - public $html_ver; // HTML version (0,1,2,3,4) - - public $attribs; // an array of attributes (see TidyAttr) - public $children; // an array of child nodes + public function attributes(); // an array of attributes (see TidyAttr) + public function children(); // an array of child nodes function has_siblings(); // any sibling nodes? function has_children(); // any child nodes? - function has_parent(); // have a parent? - + function is_comment(); // is node a comment? function is_xhtml(); // is document XHTML? function is_xml(); // is document generic XML (not HTML/XHTML) @@ -106,45 +100,12 @@ class TidyNode { function next(); // returns next node function prev(); // returns prev node - function parent(); // returns parent node - function child(); // returns first child node - + /* Searches for a particular attribute in the current node based on node ID. If found returns a TidyAttr object for it */ - function get_attr_type($attr_id); + function get_attr($attr_id); /* - - NOT YET IMPLEMENTED - - Recursively traverses the tree from the current node and returns - an array of attributes matching the node ID/attr ID pair - - Useful for pulling out things like links: - foreach($body->fetch_attrs(TIDY_TAG_A, TIDY_ATTR_HREF) as $link) { - echo "Link : {$link->value}\n"; - } - */ - - function fetch_attrs($node_id, $attr_id); - - /* - - NOT YET IMPLEMENTED - - Recursively traverses the tree from the current node and returns - an array of nodes matching the node ID - - Useful for pulling out tables, etc (echos the HTML for every - block) - - foreach($body->fetch_nodes(TIDY_TAG_TABLE) as $table) { - - echo $table->value; - - } - */ - function fetch_nodes($node_id) } class TidyAttr { @@ -153,11 +114,9 @@ class TidyAttr { public $value; // attribute value public $id; // attribute id i.e. TIDY_ATTR_HREF - function next(); // returns next attribute in tag - function tag(); // returns the tag node associated with attribute } Examples of using these objects to navigate the tree can be found in the examples/ directory (I suggest looking at urlgrab.php and dumpit.php) -E-mail thoughts, suggestions, patches, etc. to \ No newline at end of file +E-mail thoughts, suggestions, patches, etc. to diff --git a/ext/tidy/TODO b/ext/tidy/TODO index f844b6ce82d..699c207dcb4 100644 --- a/ext/tidy/TODO +++ b/ext/tidy/TODO @@ -1,5 +1,3 @@ TODO - - Implement fetch_attr(), fetch_node() methods - - Fix any memleaks - - Fix Win32 crashes + - Implement get_nodes() method diff --git a/ext/tidy/examples/cleanhtml.php b/ext/tidy/examples/cleanhtml.php index c949a0cfc2d..9d054cda4fd 100644 --- a/ext/tidy/examples/cleanhtml.php +++ b/ext/tidy/examples/cleanhtml.php @@ -12,26 +12,24 @@ * */ - $tidy = tidy_create(); - if(!isset($_SERVER['argv'][1])) { $data = file_get_contents("php://stdin"); - tidy_parse_string($tidy, $data); + tidy_parse_string($data); } else { - tidy_parse_file($tidy, $_SERVER['argv'][1]); + tidy_parse_file($_SERVER['argv'][1]); } - tidy_clean_repair($tidy); + tidy_clean_repair(); - if(tidy_warning_count($tidy) || - tidy_error_count($tidy)) { + if(tidy_warning_count() || + tidy_error_count()) { echo "\n\nThe following errors or warnings occured:\n"; - echo tidy_get_error_buffer($tidy); + echo tidy_get_error_buffer(); echo "\n"; } - echo tidy_get_output($tidy); + echo tidy_get_output(); ?> diff --git a/ext/tidy/examples/dumpit.php b/ext/tidy/examples/dumpit.php index 46d307d7049..e0cda5e904b 100644 --- a/ext/tidy/examples/dumpit.php +++ b/ext/tidy/examples/dumpit.php @@ -10,15 +10,13 @@ * Usage; php dumpit.php */ - - $tidy = tidy_create(); - tidy_parse_file($tidy, $_SERVER['argv'][1]); + tidy_parse_file($_SERVER['argv'][1]); /* Optionally you can do this here if you want to fix up the document */ - /* tidy_clean_repair($tidy); */ + /* tidy_clean_repair(); */ - $tree = tidy_get_root($tidy); + $tree = tidy_get_root(); dump_tree($tree); echo "\n"; @@ -70,12 +68,12 @@ } /* Any attributes on this node? */ - if(count($node->attribs)) { + if(count($node->attributes())) { do_leaf(" |\n", $indent); do_leaf(" +---- Attributes\n", $indent); /* Cycle through the attributes and display them and their values. */ - foreach($node->attribs as $attrib) { + foreach($node->attributes() as $attrib) { do_leaf(" +--{$attrib->name}\n", $indent); do_leaf(" | +-- Value: {$attrib->value}\n", $indent); } @@ -83,7 +81,7 @@ /* Recurse along the children to generate the remaining nodes */ if($node->has_children()) { - foreach($node->children as $child) { + foreach($node->children() as $child) { dump_tree($child, $indent + 3); } } diff --git a/ext/tidy/examples/urlgrab.php b/ext/tidy/examples/urlgrab.php index 63a2875a798..7896792ea58 100644 --- a/ext/tidy/examples/urlgrab.php +++ b/ext/tidy/examples/urlgrab.php @@ -11,18 +11,15 @@ * Usage: php urlgrab.php * */ - - /* Create a Tidy Resource */ - $tidy = tidy_create(); - + /* Parse the document */ - tidy_parse_file($tidy, $_SERVER['argv'][1]); + tidy_parse_file($_SERVER['argv'][1]); /* Fix up the document */ - tidy_clean_repair($tidy); + tidy_clean_repair(); /* Get an object representing everything from the tag in */ - $html = tidy_get_html($tidy); + $html = tidy_get_html(); /* Traverse the document tree */ print_r(get_links($html)); @@ -33,7 +30,7 @@ /* Check to see if we are on an tag or not */ if($node->id == TIDY_TAG_A) { /* If we are, find the HREF attribute */ - $attrib = $node->get_attr_type(TIDY_ATTR_HREF); + $attrib = $node->get_attr(TIDY_ATTR_HREF); if($attrib) { /* Add the value of the HREF attrib to $urls */ $urls[] = $attrib->value; @@ -45,7 +42,7 @@ if($node->has_children()) { /* Traverse down each child recursively */ - foreach($node->children as $child) { + foreach($node->children() as $child) { /* Append the results from recursion to $urls */ foreach(get_links($child) as $url) { diff --git a/ext/tidy/php_tidy.h b/ext/tidy/php_tidy.h index 0e077aac368..e170d37086e 100644 --- a/ext/tidy/php_tidy.h +++ b/ext/tidy/php_tidy.h @@ -95,6 +95,33 @@ extern zend_module_entry tidy_module_entry; obj = (PHPTidyObj *)zend_object_store_get_object(object TSRMLS_CC); \ } +#define INSTANCIATE_NODE(_zval, _container, _node) \ + tidy_instanciate(tidy_ce_node, _zval TSRMLS_CC); \ + _container = (PHPTidyObj *) zend_object_store_get_object(_zval TSRMLS_CC); \ + _container->node = _node; \ + _container->attr = NULL; \ + _container->type = is_node; \ + tidy_add_default_properities(_container, is_node TSRMLS_CC); + +#define INSTANCIATE_ATTR(_zval, _container, _attr) \ + tidy_instanciate(tidy_ce_attr, _zval TSRMLS_CC); \ + _container = (PHPTidyObj *) zend_object_store_get_object(_zval TSRMLS_CC); \ + _container->node = NULL; \ + _container->attr = _attr; \ + _container->type = is_attr; \ + tidy_add_default_properities(_container, is_attr TSRMLS_CC); + +#define PHP_NODE_METHOD_IS_TYPE(_type, _const) \ +PHP_NODE_METHOD(is_ ##_type) \ +{ \ + GET_THIS_CONTAINER(); \ + if(tidyNodeGetType(obj->node) == _const) {\ + RETURN_TRUE; \ + } else { \ + RETURN_FALSE; \ + } \ +} + typedef enum { is_node, is_attr diff --git a/ext/tidy/tests/002.phpt b/ext/tidy/tests/002.phpt index b28b28410ad..83456091f70 100644 --- a/ext/tidy/tests/002.phpt +++ b/ext/tidy/tests/002.phpt @@ -7,12 +7,9 @@ tidy_parse_string() --INI-- --FILE-- "); - tidy_parse_string($tidy, ""); - - echo tidy_get_output($tidy); + echo tidy_get_output(); ?> --EXPECT-- diff --git a/ext/tidy/tests/003.phpt b/ext/tidy/tests/003.phpt index 289198c5382..b008acecdb3 100644 --- a/ext/tidy/tests/003.phpt +++ b/ext/tidy/tests/003.phpt @@ -8,12 +8,10 @@ tidy_clean_repair() --FILE-- "); - tidy_clean_repair($tidy); + tidy_parse_string(""); + tidy_clean_repair(); - echo tidy_get_output($tidy); + echo tidy_get_output(); ?> --EXPECT-- diff --git a/ext/tidy/tests/004.phpt b/ext/tidy/tests/004.phpt index 299c191dc6b..ed60a39b272 100644 --- a/ext/tidy/tests/004.phpt +++ b/ext/tidy/tests/004.phpt @@ -7,12 +7,9 @@ tidy_diagnose() --INI-- --FILE-- "); - tidy_diagnose($tidy); - echo tidy_get_error_buffer($tidy); + tidy_parse_string(""); + tidy_diagnose(); + echo tidy_get_error_buffer(); ?> --EXPECT-- diff --git a/ext/tidy/tests/005.phpt b/ext/tidy/tests/005.phpt index c1ee50713c3..d69a726c8f2 100644 --- a/ext/tidy/tests/005.phpt +++ b/ext/tidy/tests/005.phpt @@ -8,11 +8,9 @@ tidy_parse_file() --FILE-- --EXPECT-- diff --git a/ext/tidy/tests/006.phpt b/ext/tidy/tests/006.phpt index 37f3ebb899a..7ea28e79c51 100644 --- a/ext/tidy/tests/006.phpt +++ b/ext/tidy/tests/006.phpt @@ -8,11 +8,9 @@ Verbose tidy_get_error_buffer() --FILE-- "); - tidy_parse_string($tidy, ""); - - echo tidy_get_error_buffer($tidy, true); + echo tidy_get_error_buffer(true); ?> --EXPECT-- diff --git a/ext/tidy/tests/007.phpt b/ext/tidy/tests/007.phpt index f81853a29e8..9987677df6a 100644 --- a/ext/tidy/tests/007.phpt +++ b/ext/tidy/tests/007.phpt @@ -7,23 +7,22 @@ Verbose tidy_setopt() / tidy_getopt() --INI-- --FILE-- --EXPECT-- Current Value of 'tidy-mark': bool(false) diff --git a/ext/tidy/tidy.c b/ext/tidy/tidy.c index c9563b52013..22111d8b72a 100644 --- a/ext/tidy/tidy.c +++ b/ext/tidy/tidy.c @@ -238,7 +238,7 @@ static char *php_tidy_file_to_mem(char *filename, zend_bool use_include_path TSR return data; } -static void php_tidy_quick_repair(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_file) +static void php_tidy_quick_repair(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_file TSRMLS_DC) { char *data=NULL, *cfg_file=NULL, *arg1; int cfg_file_len, arg1_len; @@ -296,9 +296,10 @@ PHP_MINIT_FUNCTION(tidy) REGISTER_TIDY_CLASS(node, NULL); REGISTER_TIDY_CLASS(attr, NULL); REGISTER_TIDY_CLASS(exception, zend_exception_get_default()); -#endif + tidy_object_handlers_node.get_class_entry = tidy_get_ce_node; tidy_object_handlers_attr.get_class_entry = tidy_get_ce_attr; +#endif _php_tidy_register_tags(INIT_FUNC_ARGS_PASSTHRU); _php_tidy_register_attributes(INIT_FUNC_ARGS_PASSTHRU); @@ -485,7 +486,7 @@ PHP_FUNCTION(tidy_clean_repair) Repair a string using an optionally provided configuration file */ PHP_FUNCTION(tidy_repair_string) { - php_tidy_quick_repair(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); + php_tidy_quick_repair(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 TSRMLS_CC); } /* }}} */ @@ -1085,7 +1086,6 @@ PHP_FUNCTION(tidy_get_body) tidy_add_default_properities(obj, is_node TSRMLS_CC); } /* }}} */ -#endif /* {{{ proto void tidy_node::tidy_node() Constructor. */ @@ -1104,40 +1104,23 @@ PHP_NODE_METHOD(attributes) GET_THIS_CONTAINER(); tempattr = tidyAttrFirst(obj->node); - array_init(return_value); - + if(tempattr) { + array_init(return_value); - MAKE_STD_ZVAL(object); - tidy_instanciate(tidy_ce_node, object TSRMLS_CC); - - objtemp = (PHPTidyObj *) zend_object_store_get_object(object TSRMLS_CC); - objtemp->node = NULL; - objtemp->attr = tempattr; - objtemp->type = is_attr; - - tidy_add_default_properities(objtemp, is_attr TSRMLS_CC); - - add_next_index_zval(return_value, object); - object=NULL; - - while((tempattr = tidyAttrNext(tempattr))) { - MAKE_STD_ZVAL(object); - tidy_instanciate(tidy_ce_node, object TSRMLS_CC); - objtemp = (PHPTidyObj *) zend_object_store_get_object(object TSRMLS_CC); - objtemp->node = NULL; - objtemp->attr = tempattr; - objtemp->type = is_attr; - - tidy_add_default_properities(objtemp, is_attr TSRMLS_CC); - + do { + + MAKE_STD_ZVAL(object); + INSTANCIATE_ATTR(object, objtemp, tempattr); add_next_index_zval(return_value, object); - object=NULL; - } + + } while((tempattr = tidyAttrNext(tempattr))); } } /* }}} */ + + /* {{{ proto tidy_node tidy_node::children() Returns an array of child nodes */ PHP_NODE_METHOD(children) @@ -1148,36 +1131,16 @@ PHP_NODE_METHOD(children) GET_THIS_CONTAINER(); tempnode = tidyGetChild(obj->node); - array_init(return_value); - + if(tempnode) { + array_init(return_value); + do { - MAKE_STD_ZVAL(object); - tidy_instanciate(tidy_ce_node, object TSRMLS_CC); - - objtemp = (PHPTidyObj *) zend_object_store_get_object(object TSRMLS_CC); - objtemp->node = tempnode; - objtemp->attr = NULL; - objtemp->type = is_node; - - tidy_add_default_properities(objtemp, is_node TSRMLS_CC); - - add_next_index_zval(return_value, object); - object=NULL; - - while((tempnode = tidyGetNext(tempnode))) { - MAKE_STD_ZVAL(object); - tidy_instanciate(tidy_ce_node, object TSRMLS_CC); - objtemp = (PHPTidyObj *) zend_object_store_get_object(object TSRMLS_CC); - objtemp->node = tempnode; - objtemp->attr = NULL; - objtemp->type = is_node; - - tidy_add_default_properities(objtemp, is_node TSRMLS_CC); - + MAKE_STD_ZVAL(object); + INSTANCIATE_NODE(object, objtemp, tempnode); add_next_index_zval(return_value, object); - object=NULL; - } + + } while((tempnode = tidyGetNext(tempnode))); } } /* }}} */ @@ -1231,16 +1194,12 @@ PHP_NODE_METHOD(is_comment) PHP_NODE_METHOD(is_html) { GET_THIS_CONTAINER(); - switch(tidyNodeGetType(obj->node)) { - case TidyNode_Start: - case TidyNode_End: - case TidyNode_StartEnd: - RETURN_TRUE; - break; - default: - RETURN_FALSE; - break; + + if(tidyNodeGetType(obj->node) & (TidyNode_Start | TidyNode_End | TidyNode_StartEnd)) { + RETURN_TRUE; } + RETURN_FALSE; + } /* }}} */ @@ -1269,7 +1228,7 @@ PHP_NODE_METHOD(is_xml) } } /* }}} */ - + /* {{{ proto boolean tidy_node::is_text() Returns true if this node represents text (no markup) */ PHP_NODE_METHOD(is_text) @@ -1296,7 +1255,7 @@ PHP_NODE_METHOD(is_jste) } } /* }}} */ - + /* {{{ proto boolean tidy_node::is_asp() Returns true if this node is ASP */ PHP_NODE_METHOD(is_asp) @@ -1372,7 +1331,7 @@ PHP_NODE_METHOD(prev) PHP_NODE_METHOD(get_attr) { TidyAttr tempattr; - int param; + long param; GET_THIS_CONTAINER(); if(ZEND_NUM_ARGS() != 1) { @@ -1385,11 +1344,8 @@ PHP_NODE_METHOD(get_attr) RETURN_FALSE; } - for(tempattr = tidyAttrFirst(obj->node); - tempattr; - tempattr = tidyAttrNext(tempattr)) { + for(tempattr = tidyAttrFirst(obj->node); tempattr; tempattr = tidyAttrNext(tempattr)) { - fprintf(stderr, "Comparing %d with %d\n", tidyAttrGetId(tempattr), param); if(tidyAttrGetId(tempattr) == param) { tidy_instanciate(tidy_ce_node, return_value TSRMLS_CC); @@ -1413,6 +1369,8 @@ PHP_NODE_METHOD(get_nodes) /* TODO */ } /* }}} */ + +#endif /* ZEND_ENGINE_2 */ void _php_tidy_register_nodetypes(INIT_FUNC_ARGS) {