pgLab/core/SqlLexer.cpp

#include "SqlLexer.h"

SqlLexer::SqlLexer(QString block, LexerState currentstate, bool return_whitespace)
	: m_block(std::move(block))
	, m_state(currentstate)
	, m_returnWhitespace(return_whitespace)
{}

/**
 * @brief Consumes and returns the character at the current position.
 * @return the consumed character, or QChar::Null when the position is at or
 *         past the end of the block (the position is not advanced then)
 */
QChar SqlLexer::nextChar()
{
	if (m_pos >= m_block.size())
		return QChar(QChar::Null);

	return m_block.at(m_pos++);
}

/**
 * @brief Returns the character at the current position without consuming it.
 * @return the upcoming character, or QChar::Null when the position is at or
 *         past the end of the block
 */
QChar SqlLexer::peekChar()
{
	if (m_pos >= m_block.size())
		return QChar(QChar::Null);

	return m_block.at(m_pos);
}


/**
 * @brief Tells whether a character is one of the "self" characters.
 *
 * Corresponds to the lexer rule
 *   self   ,()\[\].;\:\+\-\*\/\%\^\<\>\=
 *
 * @param c character to classify; any type comparable with char
 * @return true when c is one of , ( ) [ ] . ; : + - * / % ^ < > =
 */
template <typename C>
inline bool isSelf(C c)
{
	static const char selfChars[] = {
		',', '(', ')', '[', ']', '.',
		';', ':', '+', '-', '*', '/',
		'%', '^', '<', '>', '='
	};
	for (const char candidate : selfChars) {
		if (c == candidate)
			return true;
	}
	return false;
}


//+ - * / < > = ~ ! @ # % ^ & | ` ?
//There are a few restrictions on your choice of name:
//   -- and /* cannot appear anywhere in an operator name, since they will be taken as the start of a comment.
//   A multicharacter operator name cannot end in + or -, unless the name also contains at least one of these characters:
//   ~ ! @ # % ^ & | ` ?
//   For example, @- is an allowed operator name, but *- is not. This restriction allows PostgreSQL to parse SQL-compliant commands without requiring spaces between tokens.
//   The use of => as an operator name is deprecated. It may be disallowed altogether in a future release.
//The operator != is mapped to <> on input, so these two names are always equivalent.
/**
 * @brief Tells whether a character may be part of an operator name.
 * @param c character to classify; any type comparable with char
 * @return true when c is one of + - * / < > = ~ ! @ # % ^ & | ` ?
 */
template <typename C>
inline bool isOperatorChar(C c)
{
	static const char operatorChars[] = {
		'+', '-', '*', '/', '<', '>', '=',
		'~', '!', '@', '#', '%', '^', '&',
		'|', '`', '?'
	};
	for (const char candidate : operatorChars) {
		if (c == candidate)
			return true;
	}
	return false;
}

//typecast		"::" IMPLEMENTED
//dot_dot			\.\. TODO
//colon_equals	":="  TODO
//equals_greater	"=>"  TODO
//less_equals		"<="  TODO
//greater_equals	">="  TODO
//less_greater	"<>"  TODO
//not_equals		"!="  TODO

// See also the PostgreSQL lexer source, src/backend/parser/scan.l (e.g. C:\Prog\postgresql-9.6.4\src\backend\parser\scan.l)

/**
 * @brief NextBasicToken
 * @param in
 * @param ofs
 * @param start
 * @param length
 * @return false when input seems invalid, it will return what it did recognize but something
 *    wasn't right, parser should try to recover
 */
bool SqlLexer::nextBasicToken(int &startpos, int &length, BasicTokenType &tokentype, QString &out)
{
	// Basically chops based on white space
	// it does also recognize comments and quoted strings/identifiers
	while (true) {
		startpos = m_pos;
		QChar c = nextChar();
//		if (LexerState::Null == m_state) {
			if (c == '\n') {
				if (m_returnWhitespace) {
					length = m_pos - startpos;
					tokentype = BasicTokenType::NewLine;
					out = "\n";
					return true;
				}
			}
			else if (c.isSpace()) {
				// Just skip whitespace
				if (m_returnWhitespace) {
					for (;;) {
						c = peekChar();
						if (c != QChar::Null && c.isSpace() && c != '\n')
							nextChar();
						else
							break;
					}
					length = m_pos - startpos;
					tokentype = BasicTokenType::WhiteSpace;
                    out = m_block.mid(startpos, length);
					return true;
				}
			}
			else if (c == '-' && peekChar() == '-') { // two dashes, start of comment
				// Loop till end of line or end of block
				c = nextChar();
				for (;;) {
					c = peekChar();
					if (c != QChar::Null && c != '\n')
						nextChar();
					else
						break;
				}
				length = m_pos - startpos;
				tokentype = BasicTokenType::Comment;
                out = m_block.mid(startpos, length);
				return true;
			}
			else if (c == ':') {
				c = peekChar();
				if (c == ':') {
					nextChar();
					length = m_pos - startpos;
					tokentype = BasicTokenType::Cast;
                    out = m_block.mid(startpos, length);
                    return true;
				}
			}
			else if (isSelf(c)) {
				length = m_pos - startpos;
				if (c == ',')
					tokentype = BasicTokenType::Comma;
				else
					tokentype = BasicTokenType::Self;

                out = m_block.mid(startpos, length);
                return true;
			}
			else if (isOperatorChar(c)) {
				while (true) {
					QChar c = peekChar();
					if (isOperatorChar(c)) {
						nextChar();
					}
					else {
						// unexpected end, pretend nothings wrong
						length = m_pos - startpos;
						tokentype = BasicTokenType::Operator;
                        out = m_block.mid(startpos, length);
                        return true;
					}
				}
			}
			else if (c == '\'') {
				// Single quoted string so it's an SQL text literal
				if (parseSingleQuotedString(startpos, length, tokentype)) {
                    out = m_block.mid(startpos, length);
                    return true;
				}
				return false;
			}
			else if (c == '"') {
				// Double quoted identifier
				if (parseDoubleQuotedIdentifier(startpos, length, tokentype)) {
                    out = m_block.mid(startpos, length);
                    return true;
				}
				return false;
			}
//			else if (c == '/' && peekChar() == '*') {
//				nextChar();
//				m_state = LexerState::InBlockComment;
//			}
			else if (c == QChar::Null) {
				length = 0;
				tokentype = BasicTokenType::End;
				return true;
			}
			else if (c == '$') {
                return parseDollarQuote(startpos, length, tokentype, out);
            }
			else {
				// Undetermined symbol
				for (;;) {
					c = peekChar();
					if (c.isLetterOrNumber() || c == '_')
						nextChar();
					else
						break;
				}
				length = m_pos - startpos;
				tokentype = BasicTokenType::Symbol;
                out = m_block.mid(startpos, length);
                return true;
			}
//		}
//		else if (LexerState::InBlockComment == m_state) {
//			if (c == QChar::Null) {
//				// eof current buffer, we need to return state so
//				if (m_pos == startpos) {
//					break;
//				}
//				else {
//					length = m_pos - startpos;
//					tokentype = BasicTokenType::OpenBlockComment;
//					return true;
//				}
//			}
//			else if (c == '*') {
//				nextChar();
//				if (peekChar() == '/') {
//					nextChar();
//					length = m_pos - startpos;
//					tokentype = BasicTokenType::BlockComment;
//					m_state = LexerState::Null;
//					return true;
//				}
//			}
//		}
	}
	return false;
}


bool SqlLexer::parseSingleQuotedString(int startpos, int &length, BasicTokenType &tokentype)
{
    while (true) {
        QChar c = peekChar();
        if (c == QChar::Null || c == '\n') {
            // unexpected end, pretend nothings wrong
            length = m_pos - startpos;
            tokentype = BasicTokenType::QuotedString;
            return true;
        }

		nextChar();
		if (c == '\'') {
			// maybe end of string literal
			if (peekChar() == '\'') {
				// Nope, just double quote to escape quote
				nextChar(); // eat it
			}
			else {
				length = m_pos - startpos;
				tokentype = BasicTokenType::QuotedString;
				return true;
			}
		}
    }

}

bool SqlLexer::parseDoubleQuotedIdentifier(int startpos, int &length, BasicTokenType &tokentype)
{
    while (true) {
        QChar c = peekChar();
        if (c == QChar::Null || c == '\n') {
            // unexpected end, pretend nothings wrong
            length = m_pos - startpos;
            tokentype = BasicTokenType::QuotedIdentifier;
            return true;
        }

		nextChar();
		if (c == '"') {
			// maybe end of string literal
			if (peekChar() == '"') {
				// Nope, just double quote to escape quote
				nextChar(); // eat it
			}
			else {
				length = m_pos - startpos;
				tokentype = BasicTokenType::QuotedIdentifier;
				return true;
			}
		}
    }
}

/**
 * @brief Scans what follows a '$': a positional parameter or a dollar-quote tag.
 *
 * Called after the leading '$' has been consumed. Recognizes
 *   - $<digits>   as BasicTokenType::Parameter
 *   - $$          as BasicTokenType::DollarQuote (empty tag)
 *   - $tag$       as BasicTokenType::DollarQuote, where tag starts with a
 *                 letter or underscore and continues with letters, numbers or
 *                 underscores (the same characters the Symbol branch of
 *                 nextBasicToken() accepts)
 * Only the delimiter itself is scanned, not the text it encloses.
 *
 * @param startpos  offset of the leading '$', used to compute the length
 * @param length    receives the number of characters from startpos to the
 *                  current position
 * @param tokentype receives the recognized type, or BasicTokenType::None on failure
 * @param out       receives the scanned text, also on failure
 * @return true for a parameter or dollar-quote tag, false when the characters
 *         after '$' form neither; the offending character has been consumed
 *         and is included in the reported text
 */
bool SqlLexer::parseDollarQuote(int startpos, int &length, BasicTokenType &tokentype, QString &out)
{
	// Publishes the span [startpos, m_pos) under the given type so that every
	// exit path, including the failing ones, leaves the outputs assigned.
	auto publish = [&](BasicTokenType type) {
		tokentype = type;
		length = m_pos - startpos;
		out = m_block.mid(startpos, length);
	};

	QChar c = nextChar();

	if (c.isDigit()) {
		// Positional parameter: take all following digits.
		while (peekChar().isDigit())
			nextChar();
		publish(BasicTokenType::Parameter);
		return true;
	}

	if (c == '$') {
		// "$$": dollar quote with an empty tag.
		publish(BasicTokenType::DollarQuote);
		return true;
	}

	if (c.isLetter() || c == '_') {
		// Possibly a tagged dollar quote; it is only valid once the closing
		// '$' of the tag is found.
		for (;;) {
			c = nextChar();
			if (c == '$') {
				publish(BasicTokenType::DollarQuote);
				return true;
			}
			if (!(c.isLetterOrNumber() || c == '_')) {
				// ERROR, character not allowed in a tag (this also covers
				// running out of input, where nextChar() yields Null)
				publish(BasicTokenType::None);
				return false;
			}
		}
	}

	// Neither a parameter nor a dollar quote.
	publish(BasicTokenType::None);
	return false;
}