pgLab/core/SqlLexer.cpp
eelke 48ac8c6bab Improved generation of c/cpp string from query
Extra lines before and after query are removed. Whitespace at end of line
is removed. SQL comments are converted to cpp style comments and are outside
the string literal.

To achieve this the function now uses the SQLLexer to know what is comment.
This also required the additional capability in the lexer to also return whitespace and newline tokens.
Also a few bugs in the lexer were fixed.
2019-08-19 13:52:23 +02:00

334 lines
8.7 KiB
C++

#include "SqlLexer.h"
SqlLexer::SqlLexer(QString block, LexerState currentstate, bool return_whitespace)
: m_block(std::move(block))
, m_state(currentstate)
, m_returnWhitespace(return_whitespace)
{}
/// Consumes and returns the character at the current position.
///
/// @return the consumed character, or QChar::Null when the end of the block
///         has been reached (the position is not advanced in that case)
QChar SqlLexer::nextChar()
{
    if (m_pos >= m_block.size())
        return QChar::Null;

    return m_block.at(m_pos++);
}
/// Returns the character at the current position without consuming it.
///
/// @return the upcoming character, or QChar::Null when the end of the block
///         has been reached
QChar SqlLexer::peekChar()
{
    if (m_pos >= m_block.size())
        return QChar::Null;

    return m_block.at(m_pos);
}
// "self" characters: , ( ) [ ] . ; : + - * / % ^ < > =
// Each of them forms a token on its own.
//
// Returns true when c compares equal to one of the characters listed above.
template <typename C>
inline bool isSelf(C c)
{
    static constexpr char selfChars[] = {
        ',', '(', ')', '[', ']', '.',
        ';', ':', '+', '-', '*', '/',
        '%', '^', '<', '>', '='
    };
    for (const char candidate : selfChars) {
        if (c == candidate)
            return true;
    }
    return false;
}
// Characters that can be part of an operator name:
//   + - * / < > = ~ ! @ # % ^ & | ` ?
//
// There are a few restrictions on the choice of name:
//  - "--" and "/*" cannot appear anywhere in an operator name, since they will
//    be taken as the start of a comment.
//  - A multicharacter operator name cannot end in + or -, unless the name also
//    contains at least one of these characters: ~ ! @ # % ^ & | ` ?
//    For example, @- is an allowed operator name, but *- is not. This
//    restriction allows PostgreSQL to parse SQL-compliant commands without
//    requiring spaces between tokens.
//  - The use of => as an operator name is deprecated. It may be disallowed
//    altogether in a future release.
// The operator != is mapped to <> on input, so these two names are always
// equivalent.
//
// This predicate only tests whether c is one of the characters listed on the
// first line; the restrictions above are not checked here.
template <typename C>
inline bool isOperatorChar(C c)
{
    static constexpr char operatorChars[] = {
        '+', '-', '*', '/', '<', '>', '=',
        '~', '!', '@', '#', '%', '^', '&',
        '|', '`', '?'
    };
    for (const char candidate : operatorChars) {
        if (c == candidate)
            return true;
    }
    return false;
}
//typecast "::" IMPLEMENTED
//dot_dot \.\. TODO
//colon_equals ":=" TODO
//equals_greater "=>" TODO
//less_equals "<=" TODO
//greater_equals ">=" TODO
//less_greater "<>" TODO
//not_equals "!=" TODO
// See also the PostgreSQL lexer source, e.g. C:\Prog\postgresql-9.6.4\src\backend\parser\scan.l
/**
* @brief NextBasicToken
* @param in
* @param ofs
* @param start
* @param length
* @return false when input seems invalid, it will return what it did recognize but something
* wasn't right, parser should try to recover
*/
bool SqlLexer::nextBasicToken(int &startpos, int &length, BasicTokenType &tokentype, QString &out)
{
// Basically chops based on white space
// it does also recognize comments and quoted strings/identifiers
while (true) {
startpos = m_pos;
QChar c = nextChar();
// if (LexerState::Null == m_state) {
if (c == '\n') {
if (m_returnWhitespace) {
length = m_pos - startpos;
tokentype = BasicTokenType::NewLine;
out = "\n";
return true;
}
}
else if (c.isSpace()) {
// Just skip whitespace
if (m_returnWhitespace) {
for (;;) {
c = peekChar();
if (c != QChar::Null && c.isSpace() && c != '\n')
nextChar();
else
break;
}
length = m_pos - startpos;
tokentype = BasicTokenType::WhiteSpace;
QStringRef sr(&m_block, startpos, length);
out = sr.toString();
return true;
}
}
else if (c == '-' && peekChar() == '-') { // two dashes, start of comment
// Loop till end of line or end of block
c = nextChar();
for (;;) {
c = peekChar();
if (c != QChar::Null && c != '\n')
nextChar();
else
break;
}
length = m_pos - startpos;
tokentype = BasicTokenType::Comment;
QStringRef sr(&m_block, startpos, length);
out = sr.toString();
return true;
}
else if (c == ':') {
c = peekChar();
if (c == ':') {
nextChar();
length = m_pos - startpos;
tokentype = BasicTokenType::Cast;
QStringRef sr(&m_block, startpos, length);
out = sr.toString();
return true;
}
}
else if (isSelf(c)) {
length = m_pos - startpos;
if (c == ',')
tokentype = BasicTokenType::Comma;
else
tokentype = BasicTokenType::Self;
QStringRef sr(&m_block, startpos, length);
out = sr.toString();
return true;
}
else if (isOperatorChar(c)) {
while (true) {
QChar c = peekChar();
if (isOperatorChar(c)) {
nextChar();
}
else {
// unexpected end, pretend nothings wrong
length = m_pos - startpos;
tokentype = BasicTokenType::Operator;
QStringRef sr(&m_block, startpos, length);
out = sr.toString();
return true;
}
}
}
else if (c == '\'') {
// Single quoted string so it's an SQL text literal
if (parseSingleQuotedString(startpos, length, tokentype)) {
QStringRef sr(&m_block, startpos, length);
out = sr.toString();
return true;
}
return false;
}
else if (c == '"') {
// Double quoted identifier
if (parseDoubleQuotedIdentifier(startpos, length, tokentype)) {
QStringRef sr(&m_block, startpos, length);
out = sr.toString();
return true;
}
return false;
}
// else if (c == '/' && peekChar() == '*') {
// nextChar();
// m_state = LexerState::InBlockComment;
// }
else if (c == QChar::Null) {
length = 0;
tokentype = BasicTokenType::End;
return true;
}
else if (c == '$') {
return parseDollarQuote(startpos, length, tokentype, out);
}
else {
// Undetermined symbol
for (;;) {
c = peekChar();
if (c.isLetterOrNumber() || c == '_')
nextChar();
else
break;
}
length = m_pos - startpos;
tokentype = BasicTokenType::Symbol;
QStringRef sr(&m_block, startpos, length);
out = sr.toString();
return true;
}
// }
// else if (LexerState::InBlockComment == m_state) {
// if (c == QChar::Null) {
// // eof current buffer, we need to return state so
// if (m_pos == startpos) {
// break;
// }
// else {
// length = m_pos - startpos;
// tokentype = BasicTokenType::OpenBlockComment;
// return true;
// }
// }
// else if (c == '*') {
// nextChar();
// if (peekChar() == '/') {
// nextChar();
// length = m_pos - startpos;
// tokentype = BasicTokenType::BlockComment;
// m_state = LexerState::Null;
// return true;
// }
// }
// }
}
return false;
}
bool SqlLexer::parseSingleQuotedString(int startpos, int &length, BasicTokenType &tokentype)
{
while (true) {
QChar c = peekChar();
if (c == QChar::Null || c == '\n') {
// unexpected end, pretend nothings wrong
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedString;
return true;
}
nextChar();
if (c == '\'') {
// maybe end of string literal
if (peekChar() == '\'') {
// Nope, just double quote to escape quote
nextChar(); // eat it
}
else {
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedString;
return true;
}
}
}
}
bool SqlLexer::parseDoubleQuotedIdentifier(int startpos, int &length, BasicTokenType &tokentype)
{
while (true) {
QChar c = peekChar();
if (c == QChar::Null || c == '\n') {
// unexpected end, pretend nothings wrong
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedIdentifier;
return true;
}
nextChar();
if (c == '"') {
// maybe end of string literal
if (peekChar() == '"') {
// Nope, just double quote to escape quote
nextChar(); // eat it
}
else {
length = m_pos - startpos;
tokentype = BasicTokenType::QuotedIdentifier;
return true;
}
}
}
}
/// Scans a token that starts with a dollar sign.
///
/// Expects the '$' itself to have been consumed already; @p startpos is the
/// offset of that '$'. Recognized are:
///  - positional parameters: '$' followed by one or more digits, e.g. $1
///  - dollar quote delimiters: "$$" or "$tag$". Only the delimiter is
///    consumed and returned, not the text that follows it. The tag follows
///    the rules of an unquoted identifier: it starts with a letter or an
///    underscore and continues with letters, digits and underscores.
///
/// On every path, including failure, the output parameters describe what was
/// consumed since @p startpos.
///
/// @param startpos  offset of the '$'
/// @param length    receives the number of characters consumed since startpos
/// @param tokentype receives Parameter or DollarQuote on success, None on
///                  failure
/// @param out       receives the consumed text
/// @return true when a parameter or dollar quote delimiter was recognized,
///         false when the characters after the '$' form neither
bool SqlLexer::parseDollarQuote(int startpos, int &length, BasicTokenType &tokentype, QString &out)
{
    // Fills in the output parameters for everything consumed since startpos
    // and passes the result through, so it can be used in a return statement.
    const auto finish = [&](BasicTokenType type, bool recognized) {
        tokentype = type;
        length = m_pos - startpos;
        QStringRef sr(&m_block, startpos, length);
        out = sr.toString();
        return recognized;
    };

    QChar c = nextChar();
    if (c.isDigit()) {
        // Positional parameter, take all following digits.
        while (peekChar().isDigit())
            nextChar();
        return finish(BasicTokenType::Parameter, true);
    }
    if (c == '$') {
        // "$$" is a dollar quote with an empty tag.
        return finish(BasicTokenType::DollarQuote, true);
    }
    if (c.isLetter() || c == '_') {
        // Could be a tagged dollar quote, look for the '$' that ends the tag.
        for (;;) {
            c = nextChar();
            if (c == '$') {
                // Found valid dollar quote
                return finish(BasicTokenType::DollarQuote, true);
            }
            if (!(c.isLetter() || c.isDigit() || c == '_')) {
                // ERROR, unallowed character (or end of the block)
                return finish(BasicTokenType::None, false);
            }
        }
    }
    // '$' followed by something that starts neither a parameter nor a tag.
    // Report what was consumed instead of leaving the outputs untouched.
    return finish(BasicTokenType::None, false);
}