Updated test cases and examples and cleaned up the new OO code so it will

be easier to maintain.
This commit is contained in:
John Coggeshall 2003-09-22 18:40:38 +00:00
parent 6b567f80c7
commit d8eeb8e286
13 changed files with 134 additions and 212 deletions

View File

@ -1,7 +1,7 @@
README FOR ext/tidy by John Coggeshall <john@php.net>
Tidy Version: 0.5b
Tidy Version: 0.7b
Tidy is an extension based on Libtidy (http://tidy.sf.net/) and allows a PHP developer
to clean, repair, and traverse HTML, XHTML, and XML documents -- including ones with
@ -19,55 +19,55 @@ then recompile libtidy.
The Tidy extension has two separate APIs, one for general parsing, cleaning, and
repairing and another for document traversal. The general API is provided below:
tidy_create() Initialize and return a tidy document resource
tidy_parse_file($tidy, $file) Parse the document stored in $file
tidy_parse_string($tidy, $str) Parse the string stored in $str
tidy_create() Reinitialize the tidy engine
tidy_parse_file($file) Parse the document stored in $file
tidy_parse_string($str) Parse the string stored in $str
tidy_clean_repair($tidy) Clean and repair the document
tidy_diagnose($tidy) Diagnose a parsed document
tidy_clean_repair() Clean and repair the document
tidy_diagnose() Diagnose a parsed document
tidy_setopt($tidy, $opt, $val) Set a configuration option $opt to $val
tidy_getopt($tidy, $opt) Retrieve a configuration option
tidy_setopt($opt, $val) Set a configuration option $opt to $val
tidy_getopt($opt) Retrieve a configuration option
** note: $opt is a string representing the option. Right now the only
source of these options is the LibTidy source.. eventually I'll document
them offically -- see the src/config.c file in the tidy source **
** note: $opt is a string representing the option. Although no formal
documentation yet exists for PHP, you can find a description of many
of the options at http://www.w3.org/People/Raggett/tidy/ and a list of
supported options in the phpinfo() output **
tidy_get_output($tidy) Return the cleaned tidy HTML as a string
tidy_get_error_buffer($tidy) Return a log of the errors and warnings
tidy_get_output() Return the cleaned tidy HTML as a string
tidy_get_error_buffer() Return a log of the errors and warnings
returned by tidy
tidy_get_release() Return the Libtidy release date
tidy_get_status($tidy) Return the status of the document
tidy_get_html_ver($tidy) Return the major HTML version detected for
tidy_get_status() Return the status of the document
tidy_get_html_ver() Return the major HTML version detected for
the document;
tidy_is_xhtml($tidy) Determines if the document is XHTML
tidy_is_xml($tidy) Determines if the document is a generic XML
tidy_is_xhtml() Determines if the document is XHTML
tidy_is_xml() Determines if the document is a generic XML
tidy_error_count($tidy) Returns the number of errors in the document
tidy_warning_count($tidy) Returns the number of warnings in the document
tidy_access_count($tidy) Returns the number of accessibility-related
tidy_error_count() Returns the number of errors in the document
tidy_warning_count() Returns the number of warnings in the document
tidy_access_count() Returns the number of accessibility-related
warnings in the document.
tidy_config_count($tidy) Returns the number of configuration errors found
tidy_config_count() Returns the number of configuration errors found
tidy_load_config($tidy, $file) Loads the specified configuration file
tidY_load_config_enc($tidy,
$file,
tidy_load_config($file) Loads the specified configuration file
tidy_load_config_enc($file,
$enc) Loads the specified config file using the specified
character encoding
tidy_set_encoding($tidy, $enc) Sets the current character encoding for the document
tidy_save_config($tidy, $file) Saves the current config to $file
tidy_set_encoding($enc) Sets the current character encoding for the document
tidy_save_config($file) Saves the current config to $file
Beyond these general-purpose API functions, Tidy also supports the following
functions which are used to retrieve an object for document traversal:
tidy_get_root($tidy) Returns an object starting at the root of the
tidy_get_root() Returns an object starting at the root of the
document
tidy_get_head($tidy) Returns an object starting at the <HEAD> tag
tidy_get_html($tidy) Returns an object starting at the <HTML> tag
tidy_get_body($tidy) Returns an object starting at the <BODY> tag
tidy_get_head() Returns an object starting at the <HEAD> tag
tidy_get_html() Returns an object starting at the <HTML> tag
tidy_get_body() Returns an object starting at the <BODY> tag
All navigation of the specified document is done via PHP5 object constructs.
There are two types of objects which Tidy can create. The first is TidyNode, which
@ -82,18 +82,12 @@ class TidyNode {
public $type; // type of node (text, php, asp, etc.)
public $id; // id of node (i.e. TIDY_TAG_HEAD)
public $line; // line # of node in source
public $column; // column # of node in source
public $html_ver; // HTML version (0,1,2,3,4)
public $attribs; // an array of attributes (see TidyAttr)
public $children; // an array of child nodes
public function attributes(); // an array of attributes (see TidyAttr)
public function children(); // an array of child nodes
function has_siblings(); // any sibling nodes?
function has_children(); // any child nodes?
function has_parent(); // have a parent?
function is_comment(); // is node a comment?
function is_xhtml(); // is document XHTML?
function is_xml(); // is document generic XML (not HTML/XHTML)
@ -106,45 +100,12 @@ class TidyNode {
function next(); // returns next node
function prev(); // returns prev node
function parent(); // returns parent node
function child(); // returns first child node
/* Searches for a particular attribute in the current node based
on node ID. If found returns a TidyAttr object for it */
function get_attr_type($attr_id);
function get_attr($attr_id);
/*
NOT YET IMPLEMENTED
Recursively traverses the tree from the current node and returns
an array of attributes matching the node ID/attr ID pair
Useful for pulling out things like links:
foreach($body->fetch_attrs(TIDY_TAG_A, TIDY_ATTR_HREF) as $link) {
echo "Link : {$link->value}\n";
}
*/
function fetch_attrs($node_id, $attr_id);
/*
NOT YET IMPLEMENTED
Recursively traverses the tree from the current node and returns
an array of nodes matching the node ID
Useful for pulling out tables, etc (echos the HTML for every
<TABLE> block)
foreach($body->fetch_nodes(TIDY_TAG_TABLE) as $table) {
echo $table->value;
}
*/
function fetch_nodes($node_id)
}
class TidyAttr {
@ -153,11 +114,9 @@ class TidyAttr {
public $value; // attribute value
public $id; // attribute id i.e. TIDY_ATTR_HREF
function next(); // returns next attribute in tag
function tag(); // returns the tag node associated with attribute
}
Examples of using these objects to navigate the tree can be found in the examples/
directory (I suggest looking at urlgrab.php and dumpit.php)
E-mail thoughts, suggestions, patches, etc. to <john@php.net>
E-mail thoughts, suggestions, patches, etc. to <john@php.net>

View File

@ -1,5 +1,3 @@
TODO
- Implement fetch_attr(), fetch_node() methods
- Fix any memleaks
- Fix Win32 crashes
- Implement get_nodes() method

View File

@ -12,26 +12,24 @@
*
*/
$tidy = tidy_create();
if(!isset($_SERVER['argv'][1])) {
$data = file_get_contents("php://stdin");
tidy_parse_string($tidy, $data);
tidy_parse_string($data);
} else {
tidy_parse_file($tidy, $_SERVER['argv'][1]);
tidy_parse_file($_SERVER['argv'][1]);
}
tidy_clean_repair($tidy);
tidy_clean_repair();
if(tidy_warning_count($tidy) ||
tidy_error_count($tidy)) {
if(tidy_warning_count() ||
tidy_error_count()) {
echo "\n\nThe following errors or warnings occured:\n";
echo tidy_get_error_buffer($tidy);
echo tidy_get_error_buffer();
echo "\n";
}
echo tidy_get_output($tidy);
echo tidy_get_output();
?>

View File

@ -10,15 +10,13 @@
* Usage; php dumpit.php <filename>
*/
$tidy = tidy_create();
tidy_parse_file($tidy, $_SERVER['argv'][1]);
tidy_parse_file($_SERVER['argv'][1]);
/* Optionally you can do this here if you want to fix up the document */
/* tidy_clean_repair($tidy); */
/* tidy_clean_repair(); */
$tree = tidy_get_root($tidy);
$tree = tidy_get_root();
dump_tree($tree);
echo "\n";
@ -70,12 +68,12 @@
}
/* Any attributes on this node? */
if(count($node->attribs)) {
if(count($node->attributes())) {
do_leaf(" |\n", $indent);
do_leaf(" +---- Attributes\n", $indent);
/* Cycle through the attributes and display them and their values. */
foreach($node->attribs as $attrib) {
foreach($node->attributes() as $attrib) {
do_leaf(" +--{$attrib->name}\n", $indent);
do_leaf(" | +-- Value: {$attrib->value}\n", $indent);
}
@ -83,7 +81,7 @@
/* Recurse along the children to generate the remaining nodes */
if($node->has_children()) {
foreach($node->children as $child) {
foreach($node->children() as $child) {
dump_tree($child, $indent + 3);
}
}

View File

@ -11,18 +11,15 @@
* Usage: php urlgrab.php <file>
*
*/
/* Create a Tidy Resource */
$tidy = tidy_create();
/* Parse the document */
tidy_parse_file($tidy, $_SERVER['argv'][1]);
tidy_parse_file($_SERVER['argv'][1]);
/* Fix up the document */
tidy_clean_repair($tidy);
tidy_clean_repair();
/* Get an object representing everything from the <HTML> tag in */
$html = tidy_get_html($tidy);
$html = tidy_get_html();
/* Traverse the document tree */
print_r(get_links($html));
@ -33,7 +30,7 @@
/* Check to see if we are on an <A> tag or not */
if($node->id == TIDY_TAG_A) {
/* If we are, find the HREF attribute */
$attrib = $node->get_attr_type(TIDY_ATTR_HREF);
$attrib = $node->get_attr(TIDY_ATTR_HREF);
if($attrib) {
/* Add the value of the HREF attrib to $urls */
$urls[] = $attrib->value;
@ -45,7 +42,7 @@
if($node->has_children()) {
/* Traverse down each child recursively */
foreach($node->children as $child) {
foreach($node->children() as $child) {
/* Append the results from recursion to $urls */
foreach(get_links($child) as $url) {

View File

@ -95,6 +95,33 @@ extern zend_module_entry tidy_module_entry;
obj = (PHPTidyObj *)zend_object_store_get_object(object TSRMLS_CC); \
}
/* Instantiate a tidy_ce_node object into _zval and bind it to _node.
 *
 *   _zval      - zval that tidy_instanciate() turns into the new object
 *   _container - variable that receives the PHPTidyObj fetched from the
 *                object store for _zval
 *   _node      - value stored in _container->node
 *
 * The container's attr is set to NULL and its type to is_node, then
 * tidy_add_default_properities() is called for it.
 *
 * NOTE: this expands to several statements, already ends with a semicolon
 * and is not wrapped in do { } while (0); use it only inside a braced block,
 * never as the sole body of an unbraced if/else/loop.
 */
#define INSTANCIATE_NODE(_zval, _container, _node) \
tidy_instanciate(tidy_ce_node, _zval TSRMLS_CC); \
_container = (PHPTidyObj *) zend_object_store_get_object(_zval TSRMLS_CC); \
_container->node = _node; \
_container->attr = NULL; \
_container->type = is_node; \
tidy_add_default_properities(_container, is_node TSRMLS_CC);
/* Instantiate a tidy_ce_attr object into _zval and bind it to _attr.
 *
 *   _zval      - zval that tidy_instanciate() turns into the new object
 *   _container - variable that receives the PHPTidyObj fetched from the
 *                object store for _zval
 *   _attr      - value stored in _container->attr
 *
 * The container's node is set to NULL and its type to is_attr, then
 * tidy_add_default_properities() is called for it.
 *
 * NOTE: this expands to several statements, already ends with a semicolon
 * and is not wrapped in do { } while (0); use it only inside a braced block,
 * never as the sole body of an unbraced if/else/loop.
 */
#define INSTANCIATE_ATTR(_zval, _container, _attr) \
tidy_instanciate(tidy_ce_attr, _zval TSRMLS_CC); \
_container = (PHPTidyObj *) zend_object_store_get_object(_zval TSRMLS_CC); \
_container->node = NULL; \
_container->attr = _attr; \
_container->type = is_attr; \
tidy_add_default_properities(_container, is_attr TSRMLS_CC);
/* Generate a node method named is_<_type>.
 *
 *   _type  - suffix pasted after "is_" to form the method name
 *   _const - node type value the generated method compares against
 *
 * The generated body invokes GET_THIS_CONTAINER() (presumably what makes
 * `obj` available -- confirm against its definition), then returns true when
 * tidyNodeGetType(obj->node) equals _const and false otherwise.
 */
#define PHP_NODE_METHOD_IS_TYPE(_type, _const) \
PHP_NODE_METHOD(is_ ##_type) \
{ \
GET_THIS_CONTAINER(); \
if(tidyNodeGetType(obj->node) == _const) {\
RETURN_TRUE; \
} else { \
RETURN_FALSE; \
} \
}
typedef enum {
is_node,
is_attr

View File

@ -7,12 +7,9 @@ tidy_parse_string()
--INI--
--FILE--
<?php
$tidy = tidy_create();
tidy_parse_string("<HTML></HTML>");
tidy_parse_string($tidy, "<HTML></HTML>");
echo tidy_get_output($tidy);
echo tidy_get_output();
?>
--EXPECT--

View File

@ -8,12 +8,10 @@ tidy_clean_repair()
--FILE--
<?php
$tidy = tidy_create();
tidy_parse_string($tidy, "<HTML></HTML>");
tidy_clean_repair($tidy);
tidy_parse_string("<HTML></HTML>");
tidy_clean_repair();
echo tidy_get_output($tidy);
echo tidy_get_output();
?>
--EXPECT--

View File

@ -7,12 +7,9 @@ tidy_diagnose()
--INI--
--FILE--
<?php
$tidy = tidy_create();
tidy_parse_string($tidy, "<HTML></HTML>");
tidy_diagnose($tidy);
echo tidy_get_error_buffer($tidy);
tidy_parse_string("<HTML></HTML>");
tidy_diagnose();
echo tidy_get_error_buffer();
?>
--EXPECT--

View File

@ -8,11 +8,9 @@ tidy_parse_file()
--FILE--
<?php
$tidy = tidy_create();
tidy_parse_file("ext/tidy/tests/005.html");
tidy_parse_file($tidy, "ext/tidy/tests/005.html");
echo tidy_get_output($tidy);
echo tidy_get_output();
?>
--EXPECT--

View File

@ -8,11 +8,9 @@ Verbose tidy_get_error_buffer()
--FILE--
<?php
$tidy = tidy_create();
tidy_parse_string("<HTML><asd asdf></HTML>");
tidy_parse_string($tidy, "<HTML><asd asdf></HTML>");
echo tidy_get_error_buffer($tidy, true);
echo tidy_get_error_buffer(true);
?>
--EXPECT--

View File

@ -7,23 +7,22 @@ Verbose tidy_setopt() / tidy_getopt()
--INI--
--FILE--
<?php
$tidy = tidy_create();
echo "Current Value of 'tidy-mark': ";
var_dump(tidy_getopt($tidy, "tidy-mark"));
var_dump(tidy_getopt("tidy-mark"));
tidy_setopt($tidy, "tidy-mark", true);
echo "\nNew Value of 'tidy-mark': ";
var_dump(tidy_getopt($tidy, "tidy-mark"));
var_dump(tidy_getopt("tidy-mark"));
echo "Current Value of 'error-file': ";
var_dump(tidy_getopt($tidy, "error-file"));
var_dump(tidy_getopt("error-file"));
tidy_setopt($tidy, "error-file", "foobar");
echo "\nNew Value of 'error-file': ";
var_dump(tidy_getopt($tidy, "error-file"));
var_dump(tidy_getopt("error-file"));
echo "Current Value of 'tab-size': ";
var_dump(tidy_getopt($tidy, "tab-size"));
var_dump(tidy_getopt("tab-size"));
tidy_setopt($tidy, "tab-size", 10);
echo "\nNew Value of 'tab-size': ";
var_dump(tidy_getopt($tidy, "tab-size"));
var_dump(tidy_getopt("tab-size"));
?>
--EXPECT--
Current Value of 'tidy-mark': bool(false)

View File

@ -238,7 +238,7 @@ static char *php_tidy_file_to_mem(char *filename, zend_bool use_include_path TSR
return data;
}
static void php_tidy_quick_repair(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_file)
static void php_tidy_quick_repair(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_file TSRMLS_DC)
{
char *data=NULL, *cfg_file=NULL, *arg1;
int cfg_file_len, arg1_len;
@ -296,9 +296,10 @@ PHP_MINIT_FUNCTION(tidy)
REGISTER_TIDY_CLASS(node, NULL);
REGISTER_TIDY_CLASS(attr, NULL);
REGISTER_TIDY_CLASS(exception, zend_exception_get_default());
#endif
tidy_object_handlers_node.get_class_entry = tidy_get_ce_node;
tidy_object_handlers_attr.get_class_entry = tidy_get_ce_attr;
#endif
_php_tidy_register_tags(INIT_FUNC_ARGS_PASSTHRU);
_php_tidy_register_attributes(INIT_FUNC_ARGS_PASSTHRU);
@ -485,7 +486,7 @@ PHP_FUNCTION(tidy_clean_repair)
Repair a string using an optionally provided configuration file */
PHP_FUNCTION(tidy_repair_string)
{
php_tidy_quick_repair(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
php_tidy_quick_repair(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 TSRMLS_CC);
}
/* }}} */
@ -1085,7 +1086,6 @@ PHP_FUNCTION(tidy_get_body)
tidy_add_default_properities(obj, is_node TSRMLS_CC);
}
/* }}} */
#endif
/* {{{ proto void tidy_node::tidy_node()
Constructor. */
@ -1104,40 +1104,23 @@ PHP_NODE_METHOD(attributes)
GET_THIS_CONTAINER();
tempattr = tidyAttrFirst(obj->node);
array_init(return_value);
if(tempattr) {
array_init(return_value);
MAKE_STD_ZVAL(object);
tidy_instanciate(tidy_ce_node, object TSRMLS_CC);
objtemp = (PHPTidyObj *) zend_object_store_get_object(object TSRMLS_CC);
objtemp->node = NULL;
objtemp->attr = tempattr;
objtemp->type = is_attr;
tidy_add_default_properities(objtemp, is_attr TSRMLS_CC);
add_next_index_zval(return_value, object);
object=NULL;
while((tempattr = tidyAttrNext(tempattr))) {
MAKE_STD_ZVAL(object);
tidy_instanciate(tidy_ce_node, object TSRMLS_CC);
objtemp = (PHPTidyObj *) zend_object_store_get_object(object TSRMLS_CC);
objtemp->node = NULL;
objtemp->attr = tempattr;
objtemp->type = is_attr;
tidy_add_default_properities(objtemp, is_attr TSRMLS_CC);
do {
MAKE_STD_ZVAL(object);
INSTANCIATE_ATTR(object, objtemp, tempattr);
add_next_index_zval(return_value, object);
object=NULL;
}
} while((tempattr = tidyAttrNext(tempattr)));
}
}
/* }}} */
/* {{{ proto tidy_node tidy_node::children()
Returns an array of child nodes */
PHP_NODE_METHOD(children)
@ -1148,36 +1131,16 @@ PHP_NODE_METHOD(children)
GET_THIS_CONTAINER();
tempnode = tidyGetChild(obj->node);
array_init(return_value);
if(tempnode) {
array_init(return_value);
do {
MAKE_STD_ZVAL(object);
tidy_instanciate(tidy_ce_node, object TSRMLS_CC);
objtemp = (PHPTidyObj *) zend_object_store_get_object(object TSRMLS_CC);
objtemp->node = tempnode;
objtemp->attr = NULL;
objtemp->type = is_node;
tidy_add_default_properities(objtemp, is_node TSRMLS_CC);
add_next_index_zval(return_value, object);
object=NULL;
while((tempnode = tidyGetNext(tempnode))) {
MAKE_STD_ZVAL(object);
tidy_instanciate(tidy_ce_node, object TSRMLS_CC);
objtemp = (PHPTidyObj *) zend_object_store_get_object(object TSRMLS_CC);
objtemp->node = tempnode;
objtemp->attr = NULL;
objtemp->type = is_node;
tidy_add_default_properities(objtemp, is_node TSRMLS_CC);
MAKE_STD_ZVAL(object);
INSTANCIATE_NODE(object, objtemp, tempnode);
add_next_index_zval(return_value, object);
object=NULL;
}
} while((tempnode = tidyGetNext(tempnode)));
}
}
/* }}} */
@ -1231,16 +1194,12 @@ PHP_NODE_METHOD(is_comment)
/* Returns true when this node is an HTML tag node (a start tag, an end tag
 * or a combined start/end tag) and false for every other node type.
 *
 * The node type is compared for equality against each tag constant.
 * TidyNodeType is a sequential enumeration in libtidy, not a set of bit
 * flags, so OR-ing the three constants together and testing with `&` would
 * also report true for unrelated node types (e.g. comments or text).
 */
PHP_NODE_METHOD(is_html)
{
	GET_THIS_CONTAINER();

	switch(tidyNodeGetType(obj->node)) {
		case TidyNode_Start:
		case TidyNode_End:
		case TidyNode_StartEnd:
			RETURN_TRUE;
			break;

		default:
			RETURN_FALSE;
			break;
	}
}
/* }}} */
@ -1269,7 +1228,7 @@ PHP_NODE_METHOD(is_xml)
}
}
/* }}} */
/* {{{ proto boolean tidy_node::is_text()
Returns true if this node represents text (no markup) */
PHP_NODE_METHOD(is_text)
@ -1296,7 +1255,7 @@ PHP_NODE_METHOD(is_jste)
}
}
/* }}} */
/* {{{ proto boolean tidy_node::is_asp()
Returns true if this node is ASP */
PHP_NODE_METHOD(is_asp)
@ -1372,7 +1331,7 @@ PHP_NODE_METHOD(prev)
PHP_NODE_METHOD(get_attr)
{
TidyAttr tempattr;
int param;
long param;
GET_THIS_CONTAINER();
if(ZEND_NUM_ARGS() != 1) {
@ -1385,11 +1344,8 @@ PHP_NODE_METHOD(get_attr)
RETURN_FALSE;
}
for(tempattr = tidyAttrFirst(obj->node);
tempattr;
tempattr = tidyAttrNext(tempattr)) {
for(tempattr = tidyAttrFirst(obj->node); tempattr; tempattr = tidyAttrNext(tempattr)) {
fprintf(stderr, "Comparing %d with %d\n", tidyAttrGetId(tempattr), param);
if(tidyAttrGetId(tempattr) == param) {
tidy_instanciate(tidy_ce_node, return_value TSRMLS_CC);
@ -1413,6 +1369,8 @@ PHP_NODE_METHOD(get_nodes)
/* TODO */
}
/* }}} */
#endif /* ZEND_ENGINE_2 */
void _php_tidy_register_nodetypes(INIT_FUNC_ARGS)
{