= 65 && $next_ord <= 90) || ($next_ord >= 97 && $next_ord <= 122) || $next_char == '/'))
{
$tag_start = true;
$tag_start_pos = $i;
$tag_type = ($next_char != '/' ? HTML_TAG_TYPE_OPEN : HTML_TAG_TYPE_CLOSE);
continue;
}
if ($tag_start)
{
// Either we aren't currently parsing a quoted attribute value, or the next char starts a new tag (unclosed quote)
// Treat special characters as special
if (!$attr_value_start_quote || $next_char == '<')
{
// We haven't hit the tag close or a new tag (unclosed tag)
if ($char != '>' && $next_char != '<')
{
// A space ends the current token (the tag name, or an attribute name/value)
if ($char == ' ')
{
// If we haven't finished parsing the tag name yet, this must be the end of it
if (!$tag_name_done)
$tag_name_done = true;
// Just finished parsing attribute value or attribute name with no value
else
{
$attributes[$attr_name] = $attr_value;
$attr_name = $attr_value = '';
$attr_name_done = false;
}
}
elseif (!$attr_name_done && $char == '=')
$attr_name_done = true;
elseif ($tag_type == HTML_TAG_TYPE_OPEN && $char == '/' && $next_char == '>' && strtolower($attr_name) != 'href' && strtolower($attr_name) != 'src')
$tag_type = HTML_TAG_TYPE_SELFCLOSE;
// Accumulate
else
{
$ord = ord($char);
if (!$tag_name_done)
{
// Strip forward slashes from (closing) tag name
if ($char != '/')
{
// Char to lower if necessary
if ($lower_case && $ord >= 65 && $ord <= 90)
{
$char = chr($ord + 32);
$tag_error |= HTML_ERROR_IMPROPER_CASE;
$valid = false;
}
$tag_name .= $char;
}
}
elseif (!$attr_name_done)
{
// Char to lower if necessary
if ($lower_case && $ord >= 65 && $ord <= 90)
{
$char = chr($ord + 32);
$tag_error |= HTML_ERROR_IMPROPER_CASE;
$valid = false;
}
$attr_name .= $char;
}
else
{
// Just starting to parse attr value
if ($attr_value == '')
{
// We found an attribute value start quote
if ($char == "'" || $char == '"')
$attr_value_start_quote = $char;
else
{
$tag_error |= HTML_ERROR_ATTR_VAL_NOT_QUOTED;
$valid = false;
}
}
$attr_value .= $char;
}
}
// Insert missing close bracket at end of html
if ($next_char == '')
{
$html .= '>';
$len++;
$tag_error |= HTML_ERROR_UNCLOSED_TAG;
$valid = false;
}
}
// We found closing bracket or start of next tag
else
{
// Mark tag as unclosed and accumulate last char
if ($next_char == '<' && $char != '>')
{
if (!$tag_name_done)
{
// Char to lower if necessary
if ($lower_case && $ord >= 65 && $ord <= 90)
{
$char = chr($ord + 32);
$tag_error |= HTML_ERROR_IMPROPER_CASE;
$valid = false;
}
$tag_name .= $char;
}
elseif (!$attr_name_done)
{
// Char to lower if necessary
if ($lower_case && $ord >= 65 && $ord <= 90)
{
$char = chr($ord + 32);
$tag_error |= HTML_ERROR_IMPROPER_CASE;
$valid = false;
}
$attr_name .= $char;
}
elseif (!$attr_value_start_quote)
$attr_value .= $char;
$tag_error |= HTML_ERROR_UNCLOSED_TAG;
$valid = false;
}
// Properly close quote
if ($attr_value_start_quote)
{
if ($char != $attr_value_start_quote)
$attr_value .= $char;
$attr_value .= $attr_value_start_quote;
$tag_error |= HTML_ERROR_ATTR_VAL_UNCLOSED_QUOTE;
$valid = false;
}
// Store last attribute name/value
if ($tag_name_done && $attr_name != '')
$attributes[$attr_name] = $attr_value;
// Strip attributes from closing tag
if ($tag_type == HTML_TAG_TYPE_CLOSE)
{
if (!empty($attributes))
{
$attributes = array();
$tag_error |= HTML_ERROR_ATTR_IN_CLOSE;
$valid = false;
}
}
// Quote unquoted values
elseif (($tag_error & HTML_ERROR_ATTR_VAL_NOT_QUOTED) != 0)
foreach ($attributes as $attr_name => $attr_value)
if (isset($attr_value[0]) && $attr_value[0] != "'" && $attr_value[0] != '"')
$attributes[$attr_name] = $quote_type . $attr_value . $quote_type;
$tag_arr = array('type' => $tag_type, 'error' => $tag_error, 'name' => $tag_name, 'attributes' => $attributes);
$tag_name_lower = strtolower($tag_name);
// Initialize for safety
$tag = '';
// Check nesting
if (!in_array($tag_name_lower, $SELF_CLOSED_TAGS))
{
if ($tag_type == HTML_TAG_TYPE_OPEN)
array_push($open_tag_stack, $tag_arr);
elseif ($tag_type == HTML_TAG_TYPE_CLOSE)
{
$open_tag_found = false;
if (!empty($open_tag_stack))
{
// Pop off the open tag stack until we find a match or run out
do
{
$last = end($open_tag_stack);
$tag_name_prev = strtolower($last['name']);
// Improper nesting, we need to insert a proper closing tag
if ($tag_name_prev != $tag_name_lower)
{
// Don't prematurely close tables because of improper nesting
if ($tag_name_prev == 'table' || $tag_name_prev == 'tbody')
break;
$tag .= "$tag_name_prev>";
$tags[] = array('type' => HTML_TAG_TYPE_CLOSE, 'error' => HTML_ERROR_IMPROPER_NESTING, 'name' => $tag_name_prev, 'attributes' => array());
$tag_error |= HTML_ERROR_IMPROPER_NESTING;
$tag_arr['error'] = $tag_error;
$valid = false;
}
else
$open_tag_found = true;
array_pop($open_tag_stack);
} while ($tag_name_prev != $tag_name_lower && !empty($open_tag_stack));
}
// Mark closing tag for deletion
if (!$open_tag_found)
{
$tag_error |= HTML_ERROR_UNMATCHED_CLOSING;
$tag_arr['error'] = $tag_error;
$valid = false;
}
}
}
// Replace bad tag with rebuilt valid one, unless marked for deletion
if ($tag_error != 0 && ($tag_error & HTML_ERROR_UNMATCHED_CLOSING) == 0)
{
$tag .= '<';
if ($tag_type == HTML_TAG_TYPE_CLOSE)
$tag .= '/';
$tag .= $tag_name;
foreach ($attributes as $attr_name => $attr_value)
{
$tag .= " $attr_name";
if ($attr_value != '')
$tag .= "=$attr_value";
}
if ($tag_type == HTML_TAG_TYPE_SELFCLOSE)
$tag .= ' /';
$tag .= '>';
}
// Some html modification is required (replacing invalid tag or tag marked for deletion)
if ($tag != '' || ($tag_error & HTML_ERROR_UNMATCHED_CLOSING) != 0)
{
$tag_len = strlen($tag);
$beg = substr($html, 0, $tag_start_pos);
$end = ($i < $len-1 ? substr($html, $i+1) : '');
// Insert new html
$html = $beg . $tag . $end;
// Update $len
$tag_len_old = ($i-$tag_start_pos)+1;
$len += $tag_len - $tag_len_old;
// Move position to end of new html
$i = $tag_start_pos + ($tag_len-1);
}
// Add to array unless marked for deletion
if (($tag_error & HTML_ERROR_UNMATCHED_CLOSING) == 0)
$tags[] = $tag_arr;
// Re-initialize for safety
$tag_error = HTML_ERROR_NONE;
$tag_start = false;
$tag_start_pos = -1;
$tag_type = 0;
$tag_name = '';
$tag_name_done = false;
$attr_name = '';
$attr_name_done = false;
$attr_value = '';
$attr_value_start_quote = false;
$attributes = array();
}
}
// We are currently parsing a quoted attribute value, keep accumulating until we encounter end condition (closing quote, end of tag, end of html)
// End condition where we hit a new tag is handled by 'close tag' block of code
else
{
$attr_value .= $char;
// Found end condition, turn quoted value parsing off
if ($char == $attr_value_start_quote || $next_char == '>' || $next_char == '')
{
if ($next_char == '>' || $next_char == '')
{
// Unclosed quote
if ($char != $attr_value_start_quote)
{
$attr_value .= $attr_value_start_quote;
$tag_error |= HTML_ERROR_ATTR_VAL_UNCLOSED_QUOTE;
$valid = false;
}
// Unclosed tag at end of html
if ($next_char == '')
{
// Insert both missing quote and closing bracket
$html .= '>';
$len++;
$tag_error |= HTML_ERROR_UNCLOSED_TAG;
$valid = false;
}
}
// Run-on attribute, insert space before next attribute
elseif ($char == $attr_value_start_quote && (($next_ord >= 65 && $next_ord <= 90) || ($next_ord >= 97 && $next_ord <= 122)))
{
$beg = substr($html, 0, $i+1);
$end = substr($html, $i+1);
$html = $beg . ' ' . $end;
$len++;
}
$attr_value_start_quote = false;
}
}
}
}
// Go back and properly close/nest tags
while (!empty($open_tag_stack))
{
$last = array_pop($open_tag_stack);
$tag_name_prev = strtolower($last['name']);
$html .= "$tag_name_prev>";
$tags[] = array('type' => HTML_TAG_TYPE_CLOSE, 'error' => HTML_ERROR_IMPROPER_NESTING, 'name' => $tag_name_prev, 'attributes' => array());
$valid = false;
}
//print_r($tags);
return $valid;
}
?>