I wrote this FSM to tokenize HTML for my syntax highlighter. It works pretty well and it's fast, but it's currently ugly as all heck and could definitely use some love, so if anyone is up for taking a whack at cleaning it up or giving me some speed hints, it would be much appreciated.
Warning: Big Code Block
<?php
/**
* Simple FSM for tokenizing HTML, there are probably more accurate,
* better written tokenizers out there but I didn't feel like searching
* so I wrote this, it's fast enough for me.
*
* @author Shawn Biddle (email redacted)
* @version 0.5
*/
// Tokenizer state identifiers. The numeric values double as indexes into the
// $states name table inside tokenize_html(), so they must stay contiguous.
const T_TEXT = 0;          // buffering plain text
const T_TAG_START = 1;     // a "<" has just been read
const T_TAG_NAME = 2;      // buffering the name of an opening tag
const T_TAG_END = 3;       // a ">" has just closed a tag
const T_STRING_START = 4;  // inside a quoted string
const T_STRING_END = 5;    // a quoted string has just been closed
const T_TAG_END_NAME = 6;  // buffering the name of a closing tag
const T_END_SLASH = 7;     // the "/" of "</" has just been read
const T_HTML_COMMENT = 8;  // inside an HTML comment
/*
There is currently no state table built for this so it's quite ugly,
feel free to clean it up later
*/
/**
 * Splits an HTML string into a flat list of tokens for syntax highlighting.
 *
 * The input is scanned one byte at a time by a small state machine. Every
 * byte of the input ends up in exactly one token, so concatenating the
 * 'string' fields of the result reproduces the input.
 *
 * Token names that can appear in the result:
 *  - T_TEXT          text outside tags, and unquoted attribute text inside a tag
 *  - T_TAG_START     the "<" that opens a tag or comment
 *  - T_TAG_NAME      name of an opening tag (a leading "!" is kept, e.g. "!DOCTYPE")
 *  - T_END_SLASH     the "/" of a closing tag
 *  - T_TAG_END_NAME  name of a closing tag
 *  - T_TAG_END       the ">" that closes a tag
 *  - T_HTML_STRING   a quoted string, including both quote characters
 *  - T_HTML_COMMENT  a comment from "!-" up to and including "-->"
 *  - T_STRING_START  only at end of input: a quoted string that was never closed
 *
 * Notes on behaviour:
 *  - A quote starts a string only when it directly follows "=".
 *  - Inside a string, a delimiter preceded by a backslash does not end the
 *    string. This is not standard HTML, but it is what the tokenizer was
 *    written to do (useful for inline script in attributes).
 *  - Empty tokens are never emitted; an empty input yields an empty array.
 *
 * @param string $output The HTML source to tokenize.
 * @return array List of array('token' => string, 'string' => string) entries in input order.
 */
function tokenize_html($output)
{
    // Token names indexed by state constant; used to label whatever is still
    // buffered when the input ends.
    $states = array('T_TEXT', 'T_TAG_START', 'T_TAG_NAME', 'T_TAG_END', 'T_STRING_START', 'T_STRING_END',
        'T_TAG_END_NAME', 'T_END_SLASH', 'T_HTML_COMMENT');
    $whitespace = " \t\n\r\f";
    $ret_tokens = array();
    $state = T_TEXT;
    // TRUE between a tag's name and its closing ">", so that a ">" seen while
    // buffering attribute text can be told apart from a literal ">" in body text.
    $in_tag = FALSE;
    $char = '';
    $prev_char = '';
    $cur_state_string = '';
    $delimiter = '';

    /*
    The loop tests isset($output[$i]) rather than the truthiness of the
    character, because a "0" in the text is falsy and would end the scan early.
    */
    for ($i = 0; isset($output[$i]); $i++) {
        $char = $output[$i];
        switch ($state) {
            case T_TAG_START:
                // The buffer holds the "<" that brought us here.
                if ($char === '/') {
                    $ret_tokens[] = array('token' => 'T_TAG_START', 'string' => '<');
                    $state = T_END_SLASH;
                    $cur_state_string = $char;
                    $in_tag = TRUE;
                } elseif ($char === '<') {
                    // "<<": the earlier "<" was literal text, this one may open a tag.
                    $ret_tokens[] = array('token' => 'T_TEXT', 'string' => $cur_state_string);
                    $cur_state_string = $char;
                } elseif (preg_match('/[a-z!]/i', $char)) {
                    $ret_tokens[] = array('token' => 'T_TAG_START', 'string' => '<');
                    $state = T_TAG_NAME;
                    $cur_state_string = $char;
                    $in_tag = TRUE;
                } else {
                    // Not a tag after all (e.g. "a < b"): keep the "<" as text.
                    $state = T_TEXT;
                    $cur_state_string .= $char;
                }
                break;

            case T_TAG_NAME:
                if ($char === '>') {
                    $ret_tokens[] = array('token' => 'T_TAG_NAME', 'string' => $cur_state_string);
                    $ret_tokens[] = array('token' => 'T_TAG_END', 'string' => '>');
                    $state = T_TEXT;
                    $cur_state_string = '';
                    $in_tag = FALSE;
                } elseif ($char === '-' && $cur_state_string === '!') {
                    // "<!-" opens a comment; the buffer keeps the leading "!-".
                    $state = T_HTML_COMMENT;
                    $cur_state_string .= $char;
                } elseif ($char === '/' || strpos($whitespace, $char) !== FALSE) {
                    // Whitespace or a self-closing slash ends the name; what follows is attribute text.
                    $ret_tokens[] = array('token' => 'T_TAG_NAME', 'string' => $cur_state_string);
                    $state = T_TEXT;
                    $cur_state_string = $char;
                } else {
                    $cur_state_string .= $char;
                }
                break;

            case T_END_SLASH:
                // The buffer holds the "/" of "</".
                $ret_tokens[] = array('token' => 'T_END_SLASH', 'string' => $cur_state_string);
                if ($char === '>') {
                    $ret_tokens[] = array('token' => 'T_TAG_END', 'string' => '>');
                    $state = T_TEXT;
                    $cur_state_string = '';
                    $in_tag = FALSE;
                } else {
                    $state = T_TAG_END_NAME;
                    $cur_state_string = $char;
                }
                break;

            case T_TAG_END_NAME:
                if ($char === '>') {
                    $ret_tokens[] = array('token' => 'T_TAG_END_NAME', 'string' => $cur_state_string);
                    $ret_tokens[] = array('token' => 'T_TAG_END', 'string' => '>');
                    $state = T_TEXT;
                    $cur_state_string = '';
                    $in_tag = FALSE;
                } elseif (strpos($whitespace, $char) !== FALSE) {
                    $ret_tokens[] = array('token' => 'T_TAG_END_NAME', 'string' => $cur_state_string);
                    $state = T_TEXT;
                    $cur_state_string = $char;
                } else {
                    $cur_state_string .= $char;
                }
                break;

            case T_STRING_START:
                // Everything, including "<" and ">", is literal inside a string.
                $cur_state_string .= $char;
                // An unescaped delimiter closes the string. Only the single previous
                // character is inspected, so an escaped backslash right before the
                // closing quote is not recognised.
                if ($char === $delimiter && $prev_char !== '\\') {
                    $ret_tokens[] = array('token' => 'T_HTML_STRING', 'string' => $cur_state_string);
                    $state = T_TEXT;
                    $cur_state_string = '';
                }
                break;

            case T_HTML_COMMENT:
                $cur_state_string .= $char;
                // Only a full "-->" ends the comment, so "->" inside it is kept as content.
                if ($char === '>' && substr($cur_state_string, -3) === '-->') {
                    $ret_tokens[] = array('token' => 'T_HTML_COMMENT', 'string' => $cur_state_string);
                    $state = T_TEXT;
                    $cur_state_string = '';
                    $in_tag = FALSE;
                }
                break;

            case T_TEXT:
            default:
                if ($char === '<') {
                    if ($cur_state_string !== '') {
                        $ret_tokens[] = array('token' => 'T_TEXT', 'string' => $cur_state_string);
                    }
                    $state = T_TAG_START;
                    $cur_state_string = $char;
                } elseif ($char === '>' && $in_tag) {
                    // Closes a tag whose attribute text (if any) is still buffered.
                    if ($cur_state_string !== '') {
                        $ret_tokens[] = array('token' => 'T_TEXT', 'string' => $cur_state_string);
                    }
                    $ret_tokens[] = array('token' => 'T_TAG_END', 'string' => '>');
                    $cur_state_string = '';
                    $in_tag = FALSE;
                } elseif (($char === '"' || $char === "'") && $prev_char === '=') {
                    if ($cur_state_string !== '') {
                        $ret_tokens[] = array('token' => 'T_TEXT', 'string' => $cur_state_string);
                    }
                    $state = T_STRING_START;
                    $cur_state_string = $char;
                    $delimiter = $char;
                } else {
                    $cur_state_string .= $char;
                }
                break;
        }
        $prev_char = $char;
    }

    // Flush whatever is still buffered, labelled with the state it was collected in.
    if ($cur_state_string !== '') {
        $ret_tokens[] = array('token' => $states[$state], 'string' => $cur_state_string);
    }
    return $ret_tokens;
}