pgLab/core/SqlLexer.cpp

#include "SqlLexer.h"

SqlLexer::SqlLexer(QString block, LexerState currentstate, bool return_whitespace)
	: m_block(std::move(block))
	, m_state(currentstate)
	, m_returnWhitespace(return_whitespace)
{}

/**
 * @brief Consumes and returns the character at the current position.
 * @return the consumed character, or QChar::Null when the end of the block
 *    has been reached (the position is not advanced in that case)
 */
QChar SqlLexer::nextChar()
{
	if (m_pos >= m_block.size())
		return QChar::Null;

	const QChar current = m_block.at(m_pos);
	++m_pos;
	return current;
}

/**
 * @brief Returns the character at the current position without consuming it.
 * @return the upcoming character, or QChar::Null at the end of the block
 */
QChar SqlLexer::peekChar()
{
	const bool atEnd = m_pos >= m_block.size();
	return atEnd ? QChar(QChar::Null) : QChar(m_block.at(m_pos));
}


/// Tells whether c is one of the single character "self" tokens:
///   , ( ) [ ] . ; : + - * / % ^ < > =
template <typename C>
inline bool isSelf(C c)
{
	// Listed element by element (not as a string literal) so that no
	// terminating NUL ends up in the table.
	static constexpr char selfChars[] = {
		',', '(', ')', '[', ']', '.',
		';', ':', '+', '-', '*', '/',
		'%', '^', '<', '>', '=',
	};

	for (const char candidate : selfChars) {
		if (c == candidate)
			return true;
	}
	return false;
}


//+ - * / < > = ~ ! @ # % ^ & | ` ?
//There are a few restrictions on your choice of name:
//   -- and /* cannot appear anywhere in an operator name, since they will be taken as the start of a comment.
//   A multicharacter operator name cannot end in + or -, unless the name also contains at least one of these characters:
//   ~ ! @ # % ^ & | ` ?
//   For example, @- is an allowed operator name, but *- is not. This restriction allows PostgreSQL to parse SQL-compliant commands without requiring spaces between tokens.
//   The use of => as an operator name is deprecated. It may be disallowed altogether in a future release.
//The operator != is mapped to <> on input, so these two names are always equivalent.
/// Tells whether c is one of the characters an operator name may consist of:
///   + - * / < > = ~ ! @ # % ^ & | ` ?
template <typename C>
inline bool isOperatorChar(C c)
{
	// Listed element by element (not as a string literal) so that no
	// terminating NUL ends up in the table.
	static constexpr char operatorChars[] = {
		'+', '-', '*', '/', '<', '>', '=',
		'~', '!', '@', '#', '%', '^', '&',
		'|', '`', '?',
	};

	for (const char candidate : operatorChars) {
		if (c == candidate)
			return true;
	}
	return false;
}

//typecast		"::" IMPLEMENTED
//dot_dot			\.\. TODO
//colon_equals	":="  TODO
//equals_greater	"=>"  TODO
//less_equals		"<="  TODO
//greater_equals	">="  TODO
//less_greater	"<>"  TODO
//not_equals		"!="  TODO

// See also src/backend/parser/scan.l in the PostgreSQL source tree (e.g. postgresql-9.6.4)

/**
 * @brief NextBasicToken
 * @param in
 * @param ofs
 * @param start
 * @param length
 * @return false when input seems invalid, it will return what it did recognize but something
 *    wasn't right, parser should try to recover
 */
bool SqlLexer::nextBasicToken(int &startpos, int &length, BasicTokenType &tokentype, QString &out)
{
	// Basically chops based on white space
	// it does also recognize comments and quoted strings/identifiers
	while (true) {
		startpos = m_pos;
		QChar c = nextChar();

        if (c == '\n') {
            if (m_returnWhitespace) {
                length = m_pos - startpos;
                tokentype = BasicTokenType::NewLine;
                out = "\n";
                return true;
            }
        }
        else if (c.isSpace()) {
            // Just skip whitespace
            if (m_returnWhitespace) {
                for (;;) {
                    c = peekChar();
                    if (c != QChar::Null && c.isSpace() && c != '\n')
                        nextChar();
                    else
                        break;
                }
                length = m_pos - startpos;
                tokentype = BasicTokenType::WhiteSpace;
                out = m_block.mid(startpos, length);
                return true;
            }
        }
        else if (c == '-' && peekChar() == '-') { // two dashes, start of comment
            // Loop till end of line or end of block
            c = nextChar();
            for (;;) {
                c = peekChar();
                if (c != QChar::Null && c != '\n')
                    nextChar();
                else
                    break;
            }
            length = m_pos - startpos;
            tokentype = BasicTokenType::Comment;
            out = m_block.mid(startpos, length);
            return true;
        }
        else if (c == ':') {
            c = peekChar();
            if (c == ':') {
                nextChar();
                length = m_pos - startpos;
                tokentype = BasicTokenType::Cast;
                out = m_block.mid(startpos, length);
                return true;
            }
        }
        else if (isSelf(c)) {
            length = m_pos - startpos;
            if (c == ',')
                tokentype = BasicTokenType::Comma;
            else
                tokentype = BasicTokenType::Self;

            out = m_block.mid(startpos, length);
            return true;
        }
        else if (isOperatorChar(c)) {
            while (true) {
                QChar c = peekChar();
                if (isOperatorChar(c)) {
                    nextChar();
                }
                else {
                    // unexpected end, pretend nothings wrong
                    length = m_pos - startpos;
                    tokentype = BasicTokenType::Operator;
                    out = m_block.mid(startpos, length);
                    return true;
                }
            }
        }
        else if (c == '\'') {
            // Single quoted string so it's an SQL text literal
            if (parseSingleQuotedString(startpos, length, tokentype)) {
                out = m_block.mid(startpos, length);
                return true;
            }
            return false;
        }
        else if (c == '"') {
            // Double quoted identifier
            if (parseDoubleQuotedIdentifier(startpos, length, tokentype)) {
                out = m_block.mid(startpos, length);
                return true;
            }
            return false;
        }
        else if (c == QChar::Null) {
            length = 0;
            tokentype = BasicTokenType::End;
            return true;
        }
        else if (c == '$') {
            return parseDollarQuote(startpos, length, tokentype, out);
        }
        else {
            // Undetermined symbol
            for (;;) {
                c = peekChar();
                if (c.isLetterOrNumber() || c == '_')
                    nextChar();
                else
                    break;
            }
            length = m_pos - startpos;
            tokentype = BasicTokenType::Symbol;
            out = m_block.mid(startpos, length);
            return true;
        }
	}
	return false;
}


bool SqlLexer::parseSingleQuotedString(int startpos, int &length, BasicTokenType &tokentype)
{
    while (true) {
        QChar c = peekChar();
        if (c == QChar::Null || c == '\n') {
            // unexpected end, pretend nothings wrong
            length = m_pos - startpos;
            tokentype = BasicTokenType::QuotedString;
            return true;
        }

		nextChar();
		if (c == '\'') {
			// maybe end of string literal
			if (peekChar() == '\'') {
				// Nope, just double quote to escape quote
				nextChar(); // eat it
			}
			else {
				length = m_pos - startpos;
				tokentype = BasicTokenType::QuotedString;
				return true;
			}
		}
    }

}

bool SqlLexer::parseDoubleQuotedIdentifier(int startpos, int &length, BasicTokenType &tokentype)
{
    while (true) {
        QChar c = peekChar();
        if (c == QChar::Null || c == '\n') {
            // unexpected end, pretend nothings wrong
            length = m_pos - startpos;
            tokentype = BasicTokenType::QuotedIdentifier;
            return true;
        }

		nextChar();
		if (c == '"') {
			// maybe end of string literal
			if (peekChar() == '"') {
				// Nope, just double quote to escape quote
				nextChar(); // eat it
			}
			else {
				length = m_pos - startpos;
				tokentype = BasicTokenType::QuotedIdentifier;
				return true;
			}
		}
    }
}

/**
 * @brief Scans a token that starts with '$'; the '$' itself has already been
 *    consumed by the caller.
 *
 * Recognizes
 *  - `$$`      a dollar quote delimiter without tag
 *  - `$tag$`   a dollar quote delimiter with tag; the tag starts with a letter
 *              or underscore and continues with letters, numbers and underscores
 *  - `$123`    a positional parameter
 *
 * Only the delimiter is scanned here, not the text that follows it.
 *
 * @param startpos offset of the '$', used to calculate length
 * @param length receives the number of characters from startpos to the current position
 * @param tokentype receives DollarQuote or Parameter, or None when nothing valid was found
 * @param out receives the scanned text
 * @return false when the characters after the '$' do not form one of the above.
 *    The character that did not fit has been consumed (unless the end of the
 *    block was reached) and is included in length and out.
 */
bool SqlLexer::parseDollarQuote(int startpos, int &length, BasicTokenType &tokentype, QString &out)
{
	// Fills the output parameters for the text spanning [startpos, m_pos)
	// and passes on the result, so no exit leaves them unset.
	auto finish = [&](BasicTokenType type, bool ok) {
		tokentype = type;
		length = m_pos - startpos;
		out = m_block.mid(startpos, length);
		return ok;
	};

	QChar c = nextChar();
	if (c == '$')
		return finish(BasicTokenType::DollarQuote, true);

	if (c.isDigit()) {
		// peekChar gives QChar::Null at the end of the block, which is not a digit
		while (peekChar().isDigit())
			nextChar();
		return finish(BasicTokenType::Parameter, true);
	}

	// A tag follows the rules of an unquoted identifier
	if (c.isLetter() || c == '_') {
		for (;;) {
			c = nextChar();
			if (c == '$')
				return finish(BasicTokenType::DollarQuote, true);

			const bool tagChar = c.isLetterOrNumber() || c == '_';
			if (!tagChar) // also hit at the end of the block (QChar::Null)
				return finish(BasicTokenType::None, false);
		}
	}

	// '$' followed by something that cannot start a tag or a parameter number
	return finish(BasicTokenType::None, false);
}