pgLab/core/SqlLexer.cpp
2022-05-26 08:25:31 +02:00

292 lines
8.5 KiB
C++

#include "SqlLexer.h"
/// Creates a lexer over @p block.
/// @param block             text to tokenize; taken by value and moved into the lexer
/// @param currentstate      lexer state stored as the initial state
/// @param return_whitespace when true, nextBasicToken() reports WhiteSpace and NewLine
///                          tokens instead of skipping them
SqlLexer::SqlLexer(QString block, LexerState currentstate, bool return_whitespace)
    : m_block(std::move(block)), m_state(currentstate), m_returnWhitespace(return_whitespace)
{
}
/// Returns the character at the current position and advances past it.
/// At the end of the block QChar::Null is returned and the position is left unchanged.
QChar SqlLexer::nextChar()
{
    if (m_pos >= m_block.size())
        return QChar::Null;
    return m_block.at(m_pos++);
}
/// Returns the character at the current position without consuming it,
/// or QChar::Null when the end of the block has been reached.
QChar SqlLexer::peekChar()
{
    if (m_pos >= m_block.size())
        return QChar::Null;
    return m_block.at(m_pos);
}
// "self" characters, lexer pattern: ,()\[\].;\:\+\-\*\/\%\^\<\>\=
// Returns true when c is one of the single characters that form a token on their own.
template <typename C>
inline bool isSelf(C c)
{
    static const char selfChars[] = {
        ',', '(', ')', '[', ']', '.', ';', ':', '+', '-', '*', '/', '%', '^', '<', '>', '='
    };
    for (char candidate : selfChars) {
        if (c == candidate)
            return true;
    }
    return false;
}
//+ - * / < > = ~ ! @ # % ^ & | ` ?
//There are a few restrictions on your choice of name:
// -- and /* cannot appear anywhere in an operator name, since they will be taken as the start of a comment.
// A multicharacter operator name cannot end in + or -, unless the name also contains at least one of these characters:
// ~ ! @ # % ^ & | ` ?
// For example, @- is an allowed operator name, but *- is not. This restriction allows PostgreSQL to parse SQL-compliant commands without requiring spaces between tokens.
// The use of => as an operator name is deprecated. It may be disallowed altogether in a future release.
//The operator != is mapped to <> on input, so these two names are always equivalent.
// Returns true when c is one of the characters an operator name may be built from:
// + - * / < > = ~ ! @ # % ^ & | ` ?
template <typename C>
inline bool isOperatorChar(C c)
{
    for (const char *candidate = "+-*/<>=~!@#%^&|`?"; *candidate != '\0'; ++candidate) {
        if (c == *candidate)
            return true;
    }
    return false;
}
//typecast "::" IMPLEMENTED
//dot_dot \.\. TODO
//colon_equals ":=" TODO
//equals_greater "=>" TODO
//less_equals "<=" TODO
//greater_equals ">=" TODO
//less_greater "<>" TODO
//not_equals "!=" TODO
// See also src/backend/parser/scan.l in the PostgreSQL source tree (e.g. postgresql-9.6.4); the rule names listed above come from that file.
/**
* @brief NextBasicToken
* @param in
* @param ofs
* @param start
* @param length
* @return false when input seems invalid, it will return what it did recognize but something
* wasn't right, parser should try to recover
*/
bool SqlLexer::nextBasicToken(int &startpos, int &length, BasicTokenType &tokentype, QString &out)
{
// Basically chops based on white space
// it does also recognize comments and quoted strings/identifiers
while (true) {
startpos = m_pos;
QChar c = nextChar();
if (c == '\n') {
if (m_returnWhitespace) {
length = m_pos - startpos;
tokentype = BasicTokenType::NewLine;
out = "\n";
return true;
}
}
else if (c.isSpace()) {
// Just skip whitespace
if (m_returnWhitespace) {
for (;;) {
c = peekChar();
if (c != QChar::Null && c.isSpace() && c != '\n')
nextChar();
else
break;
}
length = m_pos - startpos;
tokentype = BasicTokenType::WhiteSpace;
out = m_block.mid(startpos, length);
return true;
}
}
else if (c == '-' && peekChar() == '-') { // two dashes, start of comment
// Loop till end of line or end of block
c = nextChar();
for (;;) {
c = peekChar();
if (c != QChar::Null && c != '\n')
nextChar();
else
break;
}
length = m_pos - startpos;
tokentype = BasicTokenType::Comment;
out = m_block.mid(startpos, length);
return true;
}
else if (c == ':') {
c = peekChar();
if (c == ':') {
nextChar();
length = m_pos - startpos;
tokentype = BasicTokenType::Cast;
out = m_block.mid(startpos, length);
return true;
}
}
else if (isSelf(c)) {
length = m_pos - startpos;
if (c == ',')
tokentype = BasicTokenType::Comma;
else
tokentype = BasicTokenType::Self;
out = m_block.mid(startpos, length);
return true;
}
else if (isOperatorChar(c)) {
while (true) {
QChar c = peekChar();
if (isOperatorChar(c)) {
nextChar();
}
else {
// unexpected end, pretend nothings wrong
length = m_pos - startpos;
tokentype = BasicTokenType::Operator;
out = m_block.mid(startpos, length);
return true;
}
}
}
else if (c == '\'') {
// Single quoted string so it's an SQL text literal
if (parseSingleQuotedString(startpos, length, tokentype)) {
out = m_block.mid(startpos, length);
return true;
}
return false;
}
else if (c == '"') {
// Double quoted identifier
if (parseDoubleQuotedIdentifier(startpos, length, tokentype)) {
out = m_block.mid(startpos, length);
return true;
}
return false;
}
else if (c == QChar::Null) {
length = 0;
tokentype = BasicTokenType::End;
return true;
}
else if (c == '$') {
return parseDollarQuote(startpos, length, tokentype, out);
}
else {
// Undetermined symbol
for (;;) {
c = peekChar();
if (c.isLetterOrNumber() || c == '_')
nextChar();
else
break;
}
length = m_pos - startpos;
tokentype = BasicTokenType::Symbol;
out = m_block.mid(startpos, length);
return true;
}
}
return false;
}
bool SqlLexer::parseSingleQuotedString(int startpos, int &length, BasicTokenType &tokentype)
{
while (true) {
QChar c = peekChar();
if (c == QChar::Null || c == '\n') {
// unexpected end, pretend nothings wrong
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedString;
return true;
}
nextChar();
if (c == '\'') {
// maybe end of string literal
if (peekChar() == '\'') {
// Nope, just double quote to escape quote
nextChar(); // eat it
}
else {
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedString;
return true;
}
}
}
}
bool SqlLexer::parseDoubleQuotedIdentifier(int startpos, int &length, BasicTokenType &tokentype)
{
while (true) {
QChar c = peekChar();
if (c == QChar::Null || c == '\n') {
// unexpected end, pretend nothings wrong
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedIdentifier;
return true;
}
nextChar();
if (c == '"') {
// maybe end of string literal
if (peekChar() == '"') {
// Nope, just double quote to escape quote
nextChar(); // eat it
}
else {
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedIdentifier;
return true;
}
}
}
}
/**
 * @brief Parses what follows a '$'; nextBasicToken() calls this after consuming the '$'.
 *
 * Recognizes
 *  - positional parameters: '$' followed by one or more digits ($1, $23)
 *  - dollar quote delimiters: "$$" or "$tag$", where the tag starts with a letter
 *    or underscore and continues with letters, digits and underscores.
 *
 * Anything else is invalid. In that case the characters that were accepted so far
 * are reported with token type None and the character that did not fit is left in
 * the input so the next call can process it.
 *
 * @param startpos  offset of the '$' the token starts with
 * @param length    receives the number of characters in the token
 * @param tokentype receives Parameter, DollarQuote or None
 * @param out       receives the text of the token
 * @return true when a parameter or dollar quote delimiter was recognized, false otherwise;
 *         the output parameters are filled in both cases
 */
bool SqlLexer::parseDollarQuote(int startpos, int &length, BasicTokenType &tokentype, QString &out)
{
    // Reports the text between startpos and the current position and passes on the result.
    auto finish = [&](BasicTokenType type, bool valid) {
        tokentype = type;
        length = m_pos - startpos;
        out = m_block.mid(startpos, length);
        return valid;
    };

    QChar c = peekChar();
    if (c.isDigit()) {
        // Positional parameter, take all digits that follow
        do {
            nextChar();
        } while (peekChar().isDigit());
        return finish(BasicTokenType::Parameter, true);
    }
    if (c == '$') {
        // "$$" is a dollar quote delimiter with an empty tag
        nextChar();
        return finish(BasicTokenType::DollarQuote, true);
    }
    if (c.isLetter() || c == '_') {
        nextChar();
        for (;;) {
            c = peekChar();
            if (c == '$') {
                nextChar(); // the closing '$' is part of the delimiter
                return finish(BasicTokenType::DollarQuote, true);
            }
            if (!c.isLetterOrNumber() && c != '_')
                break; // tag is not terminated by a '$'
            nextChar();
        }
    }
    // Not a parameter and not a complete dollar quote delimiter
    return finish(BasicTokenType::None, false);
}