2003-08-01 00:22:43 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/*
|
|
|
|
* urlgrab.php
|
|
|
|
*
|
|
|
|
* A simple command-line utility to extract all of the URLs contained
|
|
|
|
* within <A HREF> tags from a document.
|
|
|
|
*
|
2004-01-03 05:23:24 +00:00
|
|
|
* NOTE: Only works with tidy for PHP 4.3.x, please see urlgrab5.php for tidy for PHP 5
|
|
|
|
*
|
2003-08-01 00:22:43 +00:00
|
|
|
* By: John Coggeshall <john@php.net>
|
|
|
|
*
|
|
|
|
* Usage: php urlgrab.php <file>
|
|
|
|
*
|
|
|
|
*/
|
2003-09-22 18:40:38 +00:00
|
|
|
|
2003-08-01 00:22:43 +00:00
|
|
|
/*
 * Script body: parse the file named by the first command-line argument,
 * let tidy repair it, then print the HREF of every <A> tag it contains.
 */

/* Refuse to run without a file argument rather than handing tidy an undefined value */
if (!isset($_SERVER['argv'][1])) {
    echo "Usage: php urlgrab.php <file>\n";
    exit(1);
}

/* Parse the document; tidy_parse_file() yields a false value when it cannot load the input */
if (!tidy_parse_file($_SERVER['argv'][1])) {
    echo "urlgrab.php: unable to parse '" . $_SERVER['argv'][1] . "'\n";
    exit(1);
}

/* Fix up the document */
tidy_clean_repair();

/* Get an object representing everything from the <HTML> tag in */
$html = tidy_get_html();

/* Traverse the document tree */
print_r(get_links($html));
|
|
|
|
|
|
|
|
/*
 * Collect the HREF value of every <A> tag at or below $node.
 *
 * $node   - a tidy node; this function reads ->id and calls get_attr(),
 *           has_children() and children() on it
 * returns - array of HREF values: the node's own link (if any) first,
 *           followed by the links of each child in the order children()
 *           yields them
 */
function get_links($node) {
    $found = array();

    /* An <A> tag contributes its own HREF, provided the attribute exists */
    if ($node->id == TIDY_TAG_A) {
        $href = $node->get_attr(TIDY_ATTR_HREF);
        if ($href) {
            array_push($found, $href->value);
        }
    }

    /* A node without children has nothing further to contribute */
    if (!$node->has_children()) {
        return $found;
    }

    /* Recurse into each child and append whatever it found to our list */
    foreach ($node->children() as $kid) {
        $found = array_merge($found, get_links($kid));
    }

    return $found;
}
|
|
|
|
|
|
|
|
?>
|