pgLab/core/SqlLexer.cpp
eelke 0cd019db92 Fix for $-quoted strings
Note this does not solve all issues because we are tokenizing contents of strings of which we do not know they contains SQL
when the string is actually not SQL and contains $ the tokenizer gets confused.
2023-01-07 07:41:58 +01:00

299 lines
8.6 KiB
C++

#include "SqlLexer.h"
SqlLexer::SqlLexer(QString block, LexerState currentstate, bool return_whitespace)
: m_block(std::move(block))
, m_state(currentstate)
, m_returnWhitespace(return_whitespace)
{}
/**
 * @brief Reads the character at the current position and advances past it.
 * @return The character read, or QChar::Null when the position is at or
 *         beyond the end of the block (the position is then left unchanged).
 */
QChar SqlLexer::nextChar()
{
    if (m_pos >= m_block.size()) {
        return QChar::Null;
    }
    return m_block.at(m_pos++);
}
/**
 * @brief Looks at the character at the current position without consuming it.
 * @return The character at the current position, or QChar::Null when the
 *         position is at or beyond the end of the block.
 */
QChar SqlLexer::peekChar()
{
    if (m_pos >= m_block.size()) {
        return QChar::Null;
    }
    return m_block.at(m_pos);
}
/**
 * @brief Tells whether a character is one of the single-character "self" tokens.
 *
 * The recognised characters are: , ( ) [ ] . ; : + - * / % ^ < > =
 *
 * @param c Character to classify; any type comparable with char using ==.
 * @return true when c equals one of the characters listed above.
 */
template <typename C>
inline bool isSelf(C c)
{
    // Explicit array (no string literal) so no terminating '\0' is ever compared.
    static const char selfChars[] = {
        ',', '(', ')', '[', ']', '.',
        ';', ':', '+', '-', '*', '/',
        '%', '^', '<', '>', '='
    };
    for (const char candidate : selfChars) {
        if (c == candidate) {
            return true;
        }
    }
    return false;
}
/**
 * @brief Tells whether a character may appear in an SQL operator name.
 *
 * The recognised characters are: + - * / < > = ~ ! @ # % ^ & | ` ?
 *
 * Restrictions on operator names, noted here for reference (they are not
 * enforced by this function):
 *  - "--" and the C-style comment opener cannot appear anywhere in an operator
 *    name, since they will be taken as the start of a comment.
 *  - A multicharacter operator name cannot end in + or -, unless the name also
 *    contains at least one of: ~ ! @ # % ^ & | ` ?
 *    For example, @- is an allowed operator name, but *- is not. This
 *    restriction allows PostgreSQL to parse SQL-compliant commands without
 *    requiring spaces between tokens.
 *  - The use of => as an operator name is deprecated. It may be disallowed
 *    altogether in a future release.
 *  - The operator != is mapped to <> on input, so these two names are always
 *    equivalent.
 *
 * @param c Character to classify; any type comparable with char using ==.
 * @return true when c equals one of the operator characters listed above.
 */
template <typename C>
inline bool isOperatorChar(C c)
{
    // Explicit array (no string literal) so no terminating '\0' is ever compared.
    static const char operatorChars[] = {
        '+', '-', '*', '/', '<', '>', '=',
        '~', '!', '@', '#', '%', '^', '&',
        '|', '`', '?'
    };
    for (const char candidate : operatorChars) {
        if (c == candidate) {
            return true;
        }
    }
    return false;
}
//typecast "::" IMPLEMENTED
//dot_dot \.\. TODO
//colon_equals ":=" TODO
//equals_greater "=>" TODO
//less_equals "<=" TODO
//greater_equals ">=" TODO
//less_greater "<>" TODO
//not_equals "!=" TODO
// See also src/backend/parser/scan.l in the PostgreSQL source tree (e.g. postgresql-9.6.4)
/**
 * @brief Extracts the next basic token from the block.
 *
 * Basically chops the input based on white space, but also recognizes
 * line comments (--), the typecast operator (::), self characters, operators,
 * single quoted strings, double quoted identifiers and $-introduced tokens.
 *
 * Newlines and other whitespace are skipped unless the lexer was constructed
 * with return_whitespace set, in which case they are reported as NewLine and
 * WhiteSpace tokens.
 *
 * @param startpos  Receives the position in the block where the token starts.
 * @param length    Receives the length of the token (0 for End).
 * @param tokentype Receives the type of the token.
 * @param out       Receives the text of the token. It is left untouched when
 *                  the End token is returned.
 * @return false when input seems invalid, it will return what it did recognize but something
 * wasn't right, parser should try to recover
 */
bool SqlLexer::nextBasicToken(int &startpos, int &length, BasicTokenType &tokentype, QString &out)
{
    while (true) {
        startpos = m_pos;
        QChar c = nextChar();
        if (c == '\n') {
            if (m_returnWhitespace) {
                length = m_pos - startpos;
                tokentype = BasicTokenType::NewLine;
                out = "\n";
                return true;
            }
            // else: skip the newline and look for the next token
        }
        else if (c.isSpace()) {
            if (m_returnWhitespace) {
                // Collect the whole run of whitespace, a newline is a token of its own
                for (;;) {
                    c = peekChar();
                    if (c != QChar::Null && c.isSpace() && c != '\n')
                        nextChar();
                    else
                        break;
                }
                length = m_pos - startpos;
                tokentype = BasicTokenType::WhiteSpace;
                out = m_block.mid(startpos, length);
                return true;
            }
            // else: just skip whitespace
        }
        else if (c == '-' && peekChar() == '-') { // two dashes, start of comment
            nextChar(); // consume the second dash
            // Loop till end of line or end of block
            for (;;) {
                c = peekChar();
                if (c != QChar::Null && c != '\n')
                    nextChar();
                else
                    break;
            }
            length = m_pos - startpos;
            tokentype = BasicTokenType::Comment;
            out = m_block.mid(startpos, length);
            return true;
        }
        else if (c == ':') {
            if (peekChar() == ':') {
                // "::" is the typecast operator
                nextChar();
                tokentype = BasicTokenType::Cast;
            }
            else {
                // A lone ':' is a self character. This branch shadows the
                // isSelf() branch below for ':', so the token has to be
                // emitted here, otherwise the colon would be silently dropped.
                tokentype = BasicTokenType::Self;
            }
            length = m_pos - startpos;
            out = m_block.mid(startpos, length);
            return true;
        }
        else if (isSelf(c)) {
            length = m_pos - startpos;
            if (c == ',')
                tokentype = BasicTokenType::Comma;
            else
                tokentype = BasicTokenType::Self;
            out = m_block.mid(startpos, length);
            return true;
        }
        else if (isOperatorChar(c)) {
            // Only reached for operator characters that are not self characters.
            // Greedily take all operator characters that follow.
            while (isOperatorChar(peekChar())) {
                nextChar();
            }
            length = m_pos - startpos;
            tokentype = BasicTokenType::Operator;
            out = m_block.mid(startpos, length);
            return true;
        }
        else if (c == '\'') {
            // Single quoted string so it's an SQL text literal
            if (parseSingleQuotedString(startpos, length, tokentype)) {
                out = m_block.mid(startpos, length);
                return true;
            }
            return false;
        }
        else if (c == '"') {
            // Double quoted identifier
            if (parseDoubleQuotedIdentifier(startpos, length, tokentype)) {
                out = m_block.mid(startpos, length);
                return true;
            }
            return false;
        }
        else if (c == QChar::Null) {
            // nextChar() returns Null when the end of the block is reached
            length = 0;
            tokentype = BasicTokenType::End;
            return true;
        }
        else if (c == '$') {
            return parseDollarQuote(startpos, length, tokentype, out);
        }
        else {
            // Undetermined symbol: the first character plus all letters,
            // numbers and underscores that follow it
            for (;;) {
                c = peekChar();
                if (c.isLetterOrNumber() || c == '_')
                    nextChar();
                else
                    break;
            }
            length = m_pos - startpos;
            tokentype = BasicTokenType::Symbol;
            out = m_block.mid(startpos, length);
            return true;
        }
    }
}
bool SqlLexer::parseSingleQuotedString(int startpos, int &length, BasicTokenType &tokentype)
{
while (true) {
QChar c = peekChar();
if (c == QChar::Null || c == '\n') {
// unexpected end, pretend nothings wrong
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedString;
return true;
}
nextChar();
if (c == '\'') {
// maybe end of string literal
if (peekChar() == '\'') {
// Nope, just double quote to escape quote
nextChar(); // eat it
}
else {
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedString;
return true;
}
}
}
}
bool SqlLexer::parseDoubleQuotedIdentifier(int startpos, int &length, BasicTokenType &tokentype)
{
while (true) {
QChar c = peekChar();
if (c == QChar::Null || c == '\n') {
// unexpected end, pretend nothings wrong
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedIdentifier;
return true;
}
nextChar();
if (c == '"') {
// maybe end of string literal
if (peekChar() == '"') {
// Nope, just double quote to escape quote
nextChar(); // eat it
}
else {
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedIdentifier;
return true;
}
}
}
}
/**
 * @brief Parses a token that starts with '$'.
 *
 * Reading starts at the current position; nextBasicToken() calls this after
 * it has consumed the '$'. Three forms are recognized:
 *  - "$$"      an untagged dollar quote delimiter  -> DollarQuote
 *  - "$123"    a positional parameter              -> Parameter
 *  - "$tag$"   a tagged dollar quote delimiter     -> DollarQuote
 *
 * A tag follows the rules of an unquoted identifier without dollar signs: it
 * starts with a letter or underscore and continues with letters, numbers and
 * underscores.
 *
 * When none of the forms match, the character that broke the match is NOT
 * consumed, so the next token starts with it, and the outputs describe what
 * was read so far (at least the '$').
 *
 * @param startpos  Position of the '$' that started the token.
 * @param length    Receives the number of characters from startpos up to the
 *                  current position.
 * @param tokentype Receives DollarQuote, Parameter, or None on failure.
 * @param out       Receives the text of the token.
 * @return true when a dollar quote delimiter or parameter was recognized,
 *         false otherwise.
 */
bool SqlLexer::parseDollarQuote(int startpos, int &length, BasicTokenType &tokentype, QString &out)
{
    // Publishes the token read so far; used on every exit path so the output
    // parameters are never left unset.
    const auto finish = [&](BasicTokenType type, bool recognized) {
        tokentype = type;
        length = m_pos - startpos;
        out = m_block.mid(startpos, length);
        return recognized;
    };

    // Peek first and only consume characters that belong to the token.
    QChar c = peekChar();
    if (c == '$') {
        nextChar();
        return finish(BasicTokenType::DollarQuote, true);
    }
    if (c.isDigit()) {
        nextChar();
        while (peekChar().isDigit()) {
            nextChar();
        }
        return finish(BasicTokenType::Parameter, true);
    }
    if (c.isLetter() || c == '_') {
        nextChar();
        for (;;) {
            c = peekChar();
            if (c == '$') {
                nextChar(); // closing '$' of the tag
                return finish(BasicTokenType::DollarQuote, true);
            }
            if (!(c.isLetterOrNumber() || c == '_')) {
                // Tag was not closed (this includes end of block, where
                // peekChar() returns QChar::Null)
                return finish(BasicTokenType::None, false);
            }
            nextChar();
        }
    }
    // '$' followed by something that cannot start a tag or parameter
    return finish(BasicTokenType::None, false);
}