Note: this does not solve all issues. We tokenize the contents of strings without knowing whether they actually contain SQL; when a string is not SQL and contains a $, the tokenizer gets confused.
299 lines
8.6 KiB
C++
299 lines
8.6 KiB
C++
#include "SqlLexer.h"
|
|
|
|
/// Creates a lexer over one block of text.
///
/// @param block             Text to tokenize; moved into m_block.
/// @param currentstate      Initial lexer state, stored in m_state.
/// @param return_whitespace When true, nextBasicToken() reports NewLine and
///                          WhiteSpace tokens instead of skipping them.
SqlLexer::SqlLexer(QString block, LexerState currentstate, bool return_whitespace)
	: m_block(std::move(block)),
	  m_state(currentstate),
	  m_returnWhitespace(return_whitespace)
{
}
|
|
|
|
/// Returns the character at the current position and advances past it.
/// At the end of the block QChar::Null is returned and the position is unchanged.
QChar SqlLexer::nextChar()
{
	if (m_pos >= m_block.size())
		return QChar::Null;

	const QChar current = m_block.at(m_pos);
	++m_pos;
	return current;
}
|
|
|
|
/// Returns the character at the current position without consuming it,
/// or QChar::Null when the end of the block has been reached.
QChar SqlLexer::peekChar()
{
	const bool atEnd = !(m_pos < m_block.size());
	return atEnd ? QChar(QChar::Null) : QChar(m_block.at(m_pos));
}
|
|
|
|
|
|
|
|
// Lexer pattern this helper implements:
//   self    ,()\[\].;\:\+\-\*\/\%\^\<\>\=
//
// Returns true when c is one of the single characters that form a token on
// their own. C only needs to be comparable with char using ==.
template <typename C>
inline bool isSelf(C c)
{
	const char selfChars[] = {
		',', '(', ')', '[', ']', '.', ';', ':', '+',
		'-', '*', '/', '%', '^', '<', '>', '='
	};
	for (const char candidate : selfChars) {
		if (c == candidate)
			return true;
	}
	return false;
}
|
|
|
|
|
|
// Characters that may appear in an operator name:
//   + - * / < > = ~ ! @ # % ^ & | ` ?
//
// Restrictions on operator names (not enforced by this helper):
//  - "--" and "/*" cannot appear anywhere in an operator name, since they are
//    taken as the start of a comment.
//  - A multi-character operator name cannot end in + or -, unless the name
//    also contains at least one of:  ~ ! @ # % ^ & | ` ?
//    For example, @- is an allowed operator name, but *- is not. This lets
//    PostgreSQL parse SQL-compliant commands without requiring spaces
//    between tokens.
//  - The use of => as an operator name is deprecated and may be disallowed
//    altogether in a future release.
//  - The operator != is mapped to <> on input, so the two names are always
//    equivalent.
//
// Returns true when c is one of the operator characters listed above.
// C only needs to be comparable with char using ==.
template <typename C>
inline bool isOperatorChar(C c)
{
	const char operatorChars[] = {
		'+', '-', '*', '/', '<', '>', '=', '~', '!',
		'@', '#', '%', '^', '&', '|', '`', '?'
	};
	for (const char candidate : operatorChars) {
		if (c == candidate)
			return true;
	}
	return false;
}
|
|
|
|
//typecast "::" IMPLEMENTED
|
|
//dot_dot \.\. TODO
|
|
//colon_equals ":=" TODO
|
|
//equals_greater "=>" TODO
|
|
//less_equals "<=" TODO
|
|
//greater_equals ">=" TODO
|
|
//less_greater "<>" TODO
|
|
//not_equals "!=" TODO
|
|
|
|
// See also the PostgreSQL lexer source: C:\Prog\postgresql-9.6.4\src\backend\parser\scan.l
|
|
|
|
/**
|
|
* @brief NextBasicToken
|
|
* @param in
|
|
* @param ofs
|
|
* @param start
|
|
* @param length
|
|
* @return false when input seems invalid, it will return what it did recognize but something
|
|
* wasn't right, parser should try to recover
|
|
*/
|
|
bool SqlLexer::nextBasicToken(int &startpos, int &length, BasicTokenType &tokentype, QString &out)
|
|
{
|
|
// Basically chops based on white space
|
|
// it does also recognize comments and quoted strings/identifiers
|
|
while (true) {
|
|
startpos = m_pos;
|
|
QChar c = nextChar();
|
|
|
|
if (c == '\n') {
|
|
if (m_returnWhitespace) {
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::NewLine;
|
|
out = "\n";
|
|
return true;
|
|
}
|
|
}
|
|
else if (c.isSpace()) {
|
|
// Just skip whitespace
|
|
if (m_returnWhitespace) {
|
|
for (;;) {
|
|
c = peekChar();
|
|
if (c != QChar::Null && c.isSpace() && c != '\n')
|
|
nextChar();
|
|
else
|
|
break;
|
|
}
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::WhiteSpace;
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
}
|
|
else if (c == '-' && peekChar() == '-') { // two dashes, start of comment
|
|
// Loop till end of line or end of block
|
|
c = nextChar();
|
|
for (;;) {
|
|
c = peekChar();
|
|
if (c != QChar::Null && c != '\n')
|
|
nextChar();
|
|
else
|
|
break;
|
|
}
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::Comment;
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
else if (c == ':') {
|
|
c = peekChar();
|
|
if (c == ':') {
|
|
nextChar();
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::Cast;
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
}
|
|
else if (isSelf(c)) {
|
|
length = m_pos - startpos;
|
|
if (c == ',')
|
|
tokentype = BasicTokenType::Comma;
|
|
else
|
|
tokentype = BasicTokenType::Self;
|
|
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
else if (isOperatorChar(c)) {
|
|
while (true) {
|
|
QChar c = peekChar();
|
|
if (isOperatorChar(c)) {
|
|
nextChar();
|
|
}
|
|
else {
|
|
// unexpected end, pretend nothings wrong
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::Operator;
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
else if (c == '\'') {
|
|
// Single quoted string so it's an SQL text literal
|
|
if (parseSingleQuotedString(startpos, length, tokentype)) {
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
else if (c == '"') {
|
|
// Double quoted identifier
|
|
if (parseDoubleQuotedIdentifier(startpos, length, tokentype)) {
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
else if (c == QChar::Null) {
|
|
length = 0;
|
|
tokentype = BasicTokenType::End;
|
|
return true;
|
|
}
|
|
else if (c == '$') {
|
|
return parseDollarQuote(startpos, length, tokentype, out);
|
|
}
|
|
else {
|
|
// Undetermined symbol
|
|
for (;;) {
|
|
c = peekChar();
|
|
if (c.isLetterOrNumber() || c == '_')
|
|
nextChar();
|
|
else
|
|
break;
|
|
}
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::Symbol;
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
bool SqlLexer::parseSingleQuotedString(int startpos, int &length, BasicTokenType &tokentype)
|
|
{
|
|
while (true) {
|
|
QChar c = peekChar();
|
|
if (c == QChar::Null || c == '\n') {
|
|
// unexpected end, pretend nothings wrong
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::QuotedString;
|
|
return true;
|
|
}
|
|
|
|
nextChar();
|
|
if (c == '\'') {
|
|
// maybe end of string literal
|
|
if (peekChar() == '\'') {
|
|
// Nope, just double quote to escape quote
|
|
nextChar(); // eat it
|
|
}
|
|
else {
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::QuotedString;
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
bool SqlLexer::parseDoubleQuotedIdentifier(int startpos, int &length, BasicTokenType &tokentype)
|
|
{
|
|
while (true) {
|
|
QChar c = peekChar();
|
|
if (c == QChar::Null || c == '\n') {
|
|
// unexpected end, pretend nothings wrong
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::QuotedIdentifier;
|
|
return true;
|
|
}
|
|
|
|
nextChar();
|
|
if (c == '"') {
|
|
// maybe end of string literal
|
|
if (peekChar() == '"') {
|
|
// Nope, just double quote to escape quote
|
|
nextChar(); // eat it
|
|
}
|
|
else {
|
|
length = m_pos - startpos;
|
|
tokentype = BasicTokenType::QuotedIdentifier;
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bool SqlLexer::parseDollarQuote(int startpos, int &length, BasicTokenType &tokentype, QString &out)
|
|
{
|
|
QChar c = nextChar();
|
|
if (c == '$') {
|
|
tokentype = BasicTokenType::DollarQuote;
|
|
length = m_pos - startpos;
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
|
|
if (c.isDigit()) {
|
|
for (;;) {
|
|
c = peekChar();
|
|
if (c.isDigit())
|
|
nextChar();
|
|
else
|
|
break;
|
|
}
|
|
tokentype = BasicTokenType::Parameter;
|
|
length = m_pos - startpos;
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
|
|
if (c.isLetter()) {
|
|
while (true) {
|
|
c = nextChar();
|
|
if (c == '$') {
|
|
tokentype = BasicTokenType::DollarQuote;
|
|
length = m_pos - startpos;
|
|
out = m_block.mid(startpos, length);
|
|
return true;
|
|
}
|
|
|
|
if (!c.isLetter()) {
|
|
tokentype = BasicTokenType::None;
|
|
length = m_pos - startpos;
|
|
out = m_block.mid(startpos, length);
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|