/**
 * Regular expression parser and compiler.
 * @author Shaun Jackman <sdj@sfu.ca>
 * @copyright Copyright 2004 Shaun Jackman
 */


#include "nfa.h"
#include "regex.h"
#include "set.h"
#include "util.h"
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>


/**
 * This character is escaped.
 * The flag is OR-ed into the lookahead by match() when a character was
 * written with a leading backslash, so that it no longer compares equal
 * to an operator. The parsers recover the literal character with
 * `& ~ESCAPED`.
 */
#define ESCAPED 0x80


/**
 * Functions defined on the nodes of the syntax tree.
 * compile() fills one of these in for every node it visits.
 */
typedef struct {
	// Set to false for a leaf that consumes a character, and to true for
	// '*' and '?' nodes; combined with && for '.' and with | for '|'.
	bool nullable;
	// Set of NFA state indices (a leaf contributes the index of the
	// state created for it).
	Set firstpos;
	// Set of NFA state indices whose followpos is extended when this
	// node is followed by, or repeated over, another expression.
	Set lastpos;
} Functions;


/** Input stream. Set by parse_regex() and read by match(). */
static FILE* file;

/**
 * Next character in the input stream (the parser's one-character
 * lookahead). match() sets it to 0 once a newline or EOF has been
 * consumed, and ORs ESCAPED into it for backslash escapes.
 * NOTE(review): this is a char, yet it is assigned from getc() and
 * compared against EOF; that comparison can only succeed where plain
 * char is signed -- confirm the supported platforms.
 */
static char next;

/** Current line number. Incremented by match() whenever the lookahead
 * becomes '\n'. */
static int line;


// Forward declaration: parse_symbol() recurses into parse_alternation()
// for parenthesized sub-expressions.
static RegEx* parse_alternation( void);


/**
 * Prints the specified range of characters.
 * A set consisting of exactly one character that is neither '[' nor
 * whitespace is printed with print_char(); every other set (including the
 * empty one) is printed with print_string() between '[' and ']'.
 * @param range NUL-terminated set of characters to print
 */
void
print_range( const char* range)
{
	const char first = range[0];
	// <ctype.h> functions take an int that must be representable as
	// unsigned char (or EOF); a negative plain char would be undefined.
	const bool bare = strlen( range) == 1 &&
		first != '[' && !isspace( (unsigned char)first);

	if( bare)
		print_char( first);
	else {
		putchar( '[');
		print_string( range);
		putchar( ']');
	}
}


/**
 * Prints the specified regular expression in prefix form: an opening
 * parenthesis, the node's data, then each existing child (left before
 * right) preceded by a space, and a closing parenthesis.
 * A NULL tree prints nothing.
 * @param tree root of the syntax tree to print, may be NULL
 */
void
print_regex( const RegEx* tree)
{
	if( tree != NULL) {
		const RegEx* children[2];
		int k;

		children[0] = tree->left;
		children[1] = tree->right;

		putchar( '(');
		print_range( tree->data);
		for( k = 0; k < 2; k++) {
			if( children[k] == NULL)
				continue;
			putchar( ' ');
			print_regex( children[k]);
		}
		putchar( ')');
	}
}


/**
 * Destroys the specified regular expression, freeing every node of the
 * tree. A NULL tree is accepted and ignored.
 * @param tree root of the syntax tree to free, may be NULL
 */
void
destroy_regex( RegEx* tree)
{
	// Walk down the right spine iteratively; only left subtrees recurse.
	while( tree != NULL) {
		RegEx* rest = tree->right;
		destroy_regex( tree->left);
		free( tree);
		tree = rest;
	}
}


/**
 * Creates a new node with the specified children and data.
 * @param left left child, or NULL
 * @param right right child, or NULL
 * @param data single character stored as the node's one-character,
 *             NUL-terminated data string
 * @return the newly allocated node
 */
static RegEx*
create_node( RegEx* left, RegEx* right, char data)
{
	// The node's data as a complete one-character string.
	const char text[2] = { data, '\0' };
	RegEx* node = allocate_memory( sizeof *node);
	assume( node != NULL, __FUNCTION__);
	memcpy( node->data, text, sizeof text);
	node->right = right;
	node->left = left;
	return node;
}


/**
 * Removes the specified character from the input stream and loads the
 * next lookahead character.
 * Reports "missing 'c'" through die() when the lookahead is not c.
 * Once a newline or EOF has been consumed the lookahead becomes 0 and
 * nothing more is read. When the new lookahead is a newline the line
 * counter is advanced. When it is a backslash, the following character
 * is read as well: an operator, bracket or backslash stands for itself
 * and a C escape letter for its control character, in both cases with
 * ESCAPED set; anything else is reported through die().
 * @param c the character the lookahead is expected to hold
 */
static void
match( char c)
{
	// Escape letters and, at the same index, the characters they denote.
	static const char letters[] = "abtnvfr";
	static const char controls[] = "\a\b\t\n\v\f\r";
	// Characters that stand for themselves when escaped.
	static const char literals[] = "*+?|()[]\\";
	const char* hit;
	char esc;

	if( next != c) {
		putchar( '\n');
		die( "missing '%c'\n", c);
	}

	// Never read beyond the end of the line or of the input.
	if( next == '\n' || next == EOF)
		next = 0;
	else
		next = getc( file);

	if( next == '\n')
		line++;
	if( next != '\\')
		return;

	esc = getc( file);
	// strchr() would match the terminating NUL of the tables, so a NUL
	// byte after the backslash is excluded explicitly.
	if( esc != '\0' && strchr( literals, esc) != NULL)
		next = esc | ESCAPED;
	else if( esc != '\0' && (hit = strchr( letters, esc)) != NULL)
		next = controls[hit - letters] | ESCAPED;
	else
		die( "unknown escape sequence '\\%c'\n", esc);
}


/**
 * Orders two characters for sorting a character class with qsort().
 * ']' sorts before everything else, '\0' after everything else, '^'
 * directly before '\0', and all remaining characters by their char value.
 * @param pa pointer to the first character
 * @param pb pointer to the second character
 * @return 0 if equal, a negative value if *pa sorts first, a positive
 *         value if *pb sorts first
 */
int
cmp( const void* pa, const void* pb)
{
	const char lhs = *(const char*)pa;
	const char rhs = *(const char*)pb;

	if( lhs == rhs)
		return 0;
	// From here on the two differ, so at most one side is special.
	if( lhs == '\0' || rhs == '\0')
		return lhs == '\0' ? 1 : -1;
	if( lhs == '^' || rhs == '^')
		return lhs == '^' ? 1 : -1;
	if( lhs == ']' || rhs == ']')
		return lhs == ']' ? -1 : 1;
	return lhs - rhs;
}


/**
 * Parses the members of a character class, stopping with the closing ']'
 * as the lookahead.
 *
 * class -> a '-' a class
 * class -> a class
 * class -> a '-' a
 * class -> a
 *
 * Every character of every range is appended to the data of a new leaf
 * node, which is then sorted with cmp() and NUL-terminated. The ESCAPED
 * flag is stripped from the members. At least one member is always read,
 * so a ']' directly at the start is taken as a member.
 * @return the new leaf node holding the members of the class
 */
static RegEx*
parse_class( void)
{
	RegEx* tree = create_node( NULL, NULL, 0);
	// One byte of the buffer is reserved for the terminating NUL.
	const int capacity = (int)sizeof tree->data - 1;
	int count = 0;
	do {
		// The range is walked with an int: a char index could never
		// exceed b when b is the largest char value, so the loop would
		// not terminate.
		int i;
		char b, a = next & ~ESCAPED;
		assumex( next != '\n', "missing ']' at line %d", line);
		match( next);
		if( next == '-') {
			match( '-');
			b = next & ~ESCAPED;
			assumex( next != '\n', "missing ']' at line %d", line);
			match( next);
		} else
			b = a;
		assume( a <= b, "%c is not less than %c", a, b);
		for( i = a; i <= b; i++) {
			// Overlapping ranges, repeated members or an input that
			// ends inside the class must not run past the buffer.
			assumex( count < capacity,
					"character class too long at line %d", line);
			tree->data[count++] = i;
		}
	} while( next != ']');
	qsort( tree->data, count, sizeof(char), cmp);
	tree->data[count] = '\0';
	return tree;
}


/**
 * Parses a complemented character class, stopping with the closing ']'
 * as the lookahead.
 *
 * complemented_class -> '^' class
 *
 * The resulting set holds '\t', '\n' and every character from ' ' up to
 * but not including CHAR_MAX, minus the characters listed in the class,
 * sorted with cmp() and NUL-terminated.
 * @return the leaf node produced by parse_class(), with its data replaced
 *         by the complement
 */
static RegEx*
parse_complemented_class( void)
{
	RegEx* tree;
	const char* p;
	int i;
	int count = 0;
	// Membership table of the listed characters. Using a table instead
	// of a fixed-size copy of the string means a long class (repeated or
	// overlapping ranges) cannot overflow a temporary buffer.
	bool excluded[UCHAR_MAX + 1] = { false };

	match( '^');
	tree = parse_class();
	for( p = tree->data; *p; p++)
		excluded[(unsigned char)*p] = true;

	// Start from an empty set. Clearing the whole buffer also guarantees
	// that the result is NUL-terminated and that no stale byte left
	// behind by parse_class() can end up in the set.
	memset( tree->data, 0, sizeof tree->data);

	// Include '\t', '\n', and all printable characters in the set,
	// leaving out the characters of the original set. At most
	// CHAR_MAX - 30 characters are stored, which needs no more room
	// than indexing the buffer by character value up to CHAR_MAX - 1.
	if( !excluded['\t'])
		tree->data[count++] = '\t';
	if( !excluded['\n'])
		tree->data[count++] = '\n';
	for( i = ' '; i < CHAR_MAX; i++)
		if( !excluded[i])
			tree->data[count++] = i;

	qsort( tree->data, count, sizeof(char), cmp);
	return tree;
}


/**
 * Parses a single symbol: a literal character, a bracketed character
 * class, or a parenthesized sub-expression.
 *
 * symbol -> a
 * symbol -> '[' class ']'
 * symbol -> '[' complemented_class ']'
 * symbol -> '(' regex ')'
 *
 * An unescaped operator, ')', newline or EOF in this position is reported
 * through die(). Escaped characters reach the literal case and have the
 * ESCAPED flag removed.
 * @return the sub-tree for the symbol (NULL only if die() returns)
 */
static RegEx*
parse_symbol( void)
{
	RegEx* symbol = NULL;
	switch( next) {
		case '*': case '+': case '?':
		case '|': case ')': case '\n': case EOF:
			// Nothing that can start a symbol.
			die( "unexpected '%c'", next);
			break;
		case '(':
			match( '(');
			symbol = parse_alternation();
			match( ')');
			break;
		case '[':
			match( '[');
			symbol = next == '^'
				? parse_complemented_class()
				: parse_class();
			match( ']');
			break;
		default:
			symbol = create_node( NULL, NULL, next & ~ESCAPED);
			match( next);
			break;
	}
	return symbol;
}


/**
 * Parses a symbol and at most one postfix operator applied to it.
 *
 * closure -> symbol '*'
 * closure -> symbol '+'
 * closure -> symbol '?'
 * closure -> symbol
 *
 * @return the symbol's sub-tree, wrapped in an operator node (operator in
 *         data, symbol as left child, no right child) when an operator
 *         follows
 */
static RegEx*
parse_closure( void)
{
	RegEx* operand = parse_symbol();
	if( next == '*' || next == '+' || next == '?') {
		RegEx* closure = create_node( operand, NULL, next);
		match( next);
		return closure;
	}
	return operand;
}


/**
 * Parses a concatenation of one or more closures.
 *
 * string -> string closure
 * string -> closure
 *
 * Closures are read until the lookahead is '|', ')', newline or 0, and
 * are joined left-associatively by '.' nodes.
 * @return the sub-tree for the concatenation
 */
static RegEx*
parse_string( void)
{
	RegEx* sequence = parse_closure();
	for( ;;) {
		RegEx* factor;
		// strchr() also finds the terminating NUL, so a lookahead of 0
		// ends the concatenation as well.
		if( strchr( "|)\n", next) != NULL)
			break;
		factor = parse_closure();
		sequence = create_node( sequence, factor, '.');
	}
	return sequence;
}


/**
 * Parses one or more strings separated by '|'.
 *
 * alternation -> alternation '|' string
 * alternation -> string
 *
 * The alternatives are joined left-associatively by '|' nodes.
 * @return the sub-tree for the alternation
 */
static RegEx*
parse_alternation( void)
{
	RegEx* choice;
	for( choice = parse_string(); next == '|'; ) {
		RegEx* alternative;
		match( '|');
		alternative = parse_string();
		choice = create_node( choice, alternative, '|');
	}
	return choice;
}


/**
 * Parses one regular expression, terminated by a newline, from the
 * specified stream.
 *
 * regex -> alternation '\n'
 * regex -> '\n'
 * regex -> EOF
 *
 * A non-empty expression is returned concatenated with a '#' leaf that
 * marks its end.
 * @param f stream to read from; it becomes the parser's input stream
 * @return the syntax tree, or NULL for an empty line or at EOF
 */
RegEx*
parse_regex( FILE* f)
{
	RegEx* body;
	RegEx* end;

	file = f;
	// Prime the lookahead: pretend a blank was pending and consume it,
	// which makes match() read the first real character.
	next = ' ';
	match( ' ');

	if( next == EOF)
		return NULL;
	if( next == '\n') {
		match( '\n');
		return NULL;
	}

	body = parse_alternation();
	end = create_node( NULL, NULL, '#');
	body = create_node( body, end, '.');
	match( '\n');
	return body;
}


/**
 * Compiles the specified regular expression.
 * Walks the syntax tree, adds one NFA state per non-empty leaf, and
 * computes nullable, firstpos and lastpos for the node while extending
 * the followpos sets of the states:
 *  - leaf:  a new state holding the leaf's symbols; firstpos and lastpos
 *           both contain just that state. A leaf with empty data adds no
 *           state and is nullable.
 *  - '|':   union of both operands.
 *  - '.':   every state in lastpos(left) is followed by firstpos(right).
 *  - '*', '+': every state in lastpos is followed by firstpos; only '*'
 *           forces nullable.
 *  - '?':   the operand, made nullable.
 * Any other operator is reported through die().
 * @param nfa the automaton receiving the states
 * @param f receives the functions of this node
 * @param tree the node to compile, must not be NULL
 */
static void
compile( NFA* nfa, Functions* f, const RegEx* tree)
{
	Functions left, right;
	Set set;
	int i;

	if( tree->left == NULL) {
		// this node is a leaf
		// Test the first character: the array itself is never NULL, so
		// testing tree->data would make the empty-leaf branch dead.
		if( tree->data[0] != '\0') {
			// Check the limit before anything is recorded.
			assumex( nfa->count < STATES,
					"limit of %d NFA states exceeded", STATES);
			f->nullable = false;
			clear_set( &f->firstpos);
			add_element( &f->firstpos, nfa->count);
			f->lastpos = f->firstpos;
			strcpy( nfa->states[nfa->count++].symbols, tree->data);
		} else {
			f->nullable = true;
			clear_set( &f->firstpos);
			clear_set( &f->lastpos);
		}
		return;
	}

	// this node is not a leaf
	switch( tree->data[0]) {
		case '|':
			compile( nfa, f, tree->left);
			compile( nfa, &right, tree->right);
			f->nullable |= right.nullable;
			add_set( &f->firstpos, &right.firstpos);
			add_set( &f->lastpos, &right.lastpos);
			break;
		case '.':
			compile( nfa, &left, tree->left);
			compile( nfa, &right, tree->right);
			f->nullable = left.nullable && right.nullable;
			f->firstpos = left.firstpos;
			if( left.nullable)
				add_set( &f->firstpos, &right.firstpos);
			f->lastpos = right.lastpos;
			if( right.nullable)
				add_set( &f->lastpos, &left.lastpos);
			// Work on a copy: remove_first_element() empties the set.
			set = left.lastpos;
			while( (i = remove_first_element( &set)) != EMPTY_SET)
				add_set( &nfa->states[i].followpos, &right.firstpos);
			break;
		case '*': case '+':
			compile( nfa, f, tree->left);
			if( tree->data[0] == '*')
				f->nullable = true;
			// Work on a copy: remove_first_element() empties the set.
			set = f->lastpos;
			while( (i = remove_first_element( &set)) != EMPTY_SET)
				add_set( &nfa->states[i].followpos, &f->firstpos);
			break;
		case '?':
			compile( nfa, f, tree->left);
			f->nullable = true;
			break;
		default:
			// %c takes a character, not the address of the data array.
			die( "unknown operator '%c'", tree->data[0]);
	}
}


/**
 * Compiles the specified regular expression to an NFA.
 * The expression's firstpos and lastpos are merged into those of the
 * automaton, and the state added last is tagged with the token
 * (presumably the '#' end marker appended by parse_regex() -- verify
 * against the caller).
 * @param nfa the automaton to extend
 * @param regex the syntax tree to compile, must not be NULL
 * @param token the token stored in the last state
 * @return id of the compiled regex, i.e. the index of its last state
 */
int
compile_regex( NFA* nfa, const RegEx* regex, int token)
{
	Functions root;
	int last;

	compile( nfa, &root, regex);
	add_set( &nfa->firstpos, &root.firstpos);
	add_set( &nfa->lastpos, &root.lastpos);

	last = nfa->count - 1;
	nfa->states[last].token = token;
	return last;
}
