= 65 && $next_ord <= 90) || ($next_ord >= 97 && $next_ord <= 122) || $next_char == '/')) { $tag_start = true; $tag_start_pos = $i; $tag_type = ($next_char != '/' ? HTML_TAG_TYPE_OPEN : HTML_TAG_TYPE_CLOSE); continue; } if ($tag_start) { // We aren't currently parsing a quoted attribute value or we haven't hit the start of a new tag (unclosed quote) // Treat special characters as special if (!$attr_value_start_quote || $next_char == '<') { // We haven't hit the tag close or a new tag (unclosed tag) if ($char != '>' && $next_char != '<') { // Just finished parsing something if ($char == ' ') { // If we haven't finished parsing the tag name yet, this must be the end of it if (!$tag_name_done) $tag_name_done = true; // Just finished parsing attribute value or attribute name with no value else { $attributes[$attr_name] = $attr_value; $attr_name = $attr_value = ''; $attr_name_done = false; } } elseif (!$attr_name_done && $char == '=') $attr_name_done = true; elseif ($tag_type == HTML_TAG_TYPE_OPEN && $char == '/' && $next_char == '>' && strtolower($attr_name) != 'href' && strtolower($attr_name) != 'src') $tag_type = HTML_TAG_TYPE_SELFCLOSE; // Accumulate else { $ord = ord($char); if (!$tag_name_done) { // Strip forward slashes from (closing) tag name if ($char != '/') { // Char to lower if necessary if ($lower_case && $ord >= 65 && $ord <= 90) { $char = chr($ord + 32); $tag_error |= HTML_ERROR_IMPROPER_CASE; $valid = false; } $tag_name .= $char; } } elseif (!$attr_name_done) { // Char to lower if necessary if ($lower_case && $ord >= 65 && $ord <= 90) { $char = chr($ord + 32); $tag_error |= HTML_ERROR_IMPROPER_CASE; $valid = false; } $attr_name .= $char; } else { // Just starting to parse attr value if ($attr_value == '') { // We found an attribute value start quote if ($char == "'" || $char == '"') $attr_value_start_quote = $char; else { $tag_error |= HTML_ERROR_ATTR_VAL_NOT_QUOTED; $valid = false; } } $attr_value .= $char; } } // Insert missing close bracket at end of html if ($next_char == '') { $html .= '>'; $len++; $tag_error |= HTML_ERROR_UNCLOSED_TAG; $valid = false; } } // We found closing bracket or start of next tag else { // Mark tag as unclosed and accumulate last char if ($next_char == '<' && $char != '>') { if (!$tag_name_done) { // Char to lower if necessary if ($lower_case && $ord >= 65 && $ord <= 90) { $char = chr($ord + 32); $tag_error |= HTML_ERROR_IMPROPER_CASE; $valid = false; } $tag_name .= $char; } elseif (!$attr_name_done) { // Char to lower if necessary if ($lower_case && $ord >= 65 && $ord <= 90) { $char = chr($ord + 32); $tag_error |= HTML_ERROR_IMPROPER_CASE; $valid = false; } $attr_name .= $char; } elseif (!$attr_value_start_quote) $attr_value .= $char; $tag_error |= HTML_ERROR_UNCLOSED_TAG; $valid = false; } // Properly close quote if ($attr_value_start_quote) { if ($char != $attr_value_start_quote) $attr_value .= $char; $attr_value .= $attr_value_start_quote; $tag_error |= HTML_ERROR_ATTR_VAL_UNCLOSED_QUOTE; $valid = false; } // Store last attribute name/value if ($tag_name_done && $attr_name != '') $attributes[$attr_name] = $attr_value; // Strip attributes from closing tag if ($tag_type == HTML_TAG_TYPE_CLOSE) { if (!empty($attributes)) { $attributes = array(); $tag_error |= HTML_ERROR_ATTR_IN_CLOSE; $valid = false; } } // Quote unquoted values elseif (($tag_error & HTML_ERROR_ATTR_VAL_NOT_QUOTED) != 0) foreach ($attributes as $attr_name => $attr_value) if (isset($attr_value[0]) && $attr_value[0] != "'" && $attr_value[0] != '"') $attributes[$attr_name] = $quote_type . $attr_value . $quote_type; $tag_arr = array('type' => $tag_type, 'error' => $tag_error, 'name' => $tag_name, 'attributes' => $attributes); $tag_name_lower = strtolower($tag_name); // Initialize for safety $tag = ''; // Check nesting if (!in_array($tag_name_lower, $SELF_CLOSED_TAGS)) { if ($tag_type == HTML_TAG_TYPE_OPEN) array_push($open_tag_stack, $tag_arr); elseif ($tag_type == HTML_TAG_TYPE_CLOSE) { $open_tag_found = false; if (!empty($open_tag_stack)) { // Pop off the open tag stack until we find a match or run out do { $last = end($open_tag_stack); $tag_name_prev = strtolower($last['name']); // Improper nesting, we need to insert a proper closing tag if ($tag_name_prev != $tag_name_lower) { // Don't prematurely close tables because of improper nesting if ($tag_name_prev == 'table' || $tag_name_prev == 'tbody') break; $tag .= ""; $tags[] = array('type' => HTML_TAG_TYPE_CLOSE, 'error' => HTML_ERROR_IMPROPER_NESTING, 'name' => $tag_name_prev, 'attributes' => array()); $tag_error |= HTML_ERROR_IMPROPER_NESTING; $tag_arr['error'] = $tag_error; $valid = false; } else $open_tag_found = true; array_pop($open_tag_stack); } while ($tag_name_prev != $tag_name_lower && !empty($open_tag_stack)); } // Mark closing tag for deletion if (!$open_tag_found) { $tag_error |= HTML_ERROR_UNMATCHED_CLOSING; $tag_arr['error'] = $tag_error; $valid = false; } } } // Replace bad tag with rebuilt valid one, unless marked for deletion if ($tag_error != 0 && ($tag_error & HTML_ERROR_UNMATCHED_CLOSING) == 0) { $tag .= '<'; if ($tag_type == HTML_TAG_TYPE_CLOSE) $tag .= '/'; $tag .= $tag_name; foreach ($attributes as $attr_name => $attr_value) { $tag .= " $attr_name"; if ($attr_value != '') $tag .= "=$attr_value"; } if ($tag_type == HTML_TAG_TYPE_SELFCLOSE) $tag .= ' /'; $tag .= '>'; } // Some html modification is required (replacing invalid tag or tag marked for deletion) if ($tag != '' || ($tag_error & HTML_ERROR_UNMATCHED_CLOSING) != 0) { $tag_len = strlen($tag); $beg = substr($html, 0, $tag_start_pos); $end = ($i < $len-1 ? substr($html, $i+1) : ''); // Insert new html $html = $beg . $tag . $end; // Update $len $tag_len_old = ($i-$tag_start_pos)+1; $len += $tag_len - $tag_len_old; // Move position to end of new html $i = $tag_start_pos + ($tag_len-1); } // Add to array unless marked for deletion if (($tag_error & HTML_ERROR_UNMATCHED_CLOSING) == 0) $tags[] = $tag_arr; // Re-initialize for safety $tag_error = HTML_ERROR_NONE; $tag_start = false; $tag_start_pos = -1; $tag_type = 0; $tag_name = ''; $tag_name_done = false; $attr_name = ''; $attr_name_done = false; $attr_value = ''; $attr_value_start_quote = false; $attributes = array(); } } // We are currently parsing a quoted attribute value, keep accumulating until we encounter end condition (closing quote, end of tag, end of html) // End condition where we hit a new tag is handled by 'close tag' block of code else { $attr_value .= $char; // Found end condition, turn quoted value parsing off if ($char == $attr_value_start_quote || $next_char == '>' || $next_char == '') { if ($next_char == '>' || $next_char == '') { // Unclosed quote if ($char != $attr_value_start_quote) { $attr_value .= $attr_value_start_quote; $tag_error |= HTML_ERROR_ATTR_VAL_UNCLOSED_QUOTE; $valid = false; } // Unclosed tag at end of html if ($next_char == '') { // Insert both missing quote and closing bracket $html .= '>'; $len++; $tag_error |= HTML_ERROR_UNCLOSED_TAG; $valid = false; } } // Run-on attribute, insert space before next attribute elseif ($char == $attr_value_start_quote && (($next_ord >= 65 && $next_ord <= 90) || ($next_ord >= 97 && $next_ord <= 122))) { $beg = substr($html, 0, $i+1); $end = substr($html, $i+1); $html = $beg . ' ' . $end; $len++; } $attr_value_start_quote = false; } } } } // Go back and properly close/nest tags while (!empty($open_tag_stack)) { $last = array_pop($open_tag_stack); $tag_name_prev = strtolower($last['name']); $html .= ""; $tags[] = array('type' => HTML_TAG_TYPE_CLOSE, 'error' => HTML_ERROR_IMPROPER_NESTING, 'name' => $tag_name_prev, 'attributes' => array()); $valid = false; } //print_r($tags); return $valid; } ?>