From 0da32b916c515ceec0c6a0281102c74b86a1805e Mon Sep 17 00:00:00 2001 From: eelke Date: Sun, 3 Apr 2022 20:09:58 +0200 Subject: [PATCH] Lexer improvements: - Convert unquoted idents to lowercase. - Recognize quoted idents. - Allow all unicode whitespace characters - Added UnexpectedSymbol token for unexpected input (otherwise it is just ignored) - Handle mixed case keywords in the lexer file instead of filtering the stream --- pglablib/CaseChangingCharStream.h | 83 ------------------------------ pglablib/PgsqlLexer.g4 | 72 ++++++++++++++++++++------ pglablib/pglablib.pro | 1 - pglablib/sqlparser/Parser.cpp | 3 +- pglablib/sqlparser/Parser.h | 2 - tests/pglabtests/tst_newParser.cpp | 75 +++++++++++++++++++++++++++ 6 files changed, 132 insertions(+), 104 deletions(-) delete mode 100644 pglablib/CaseChangingCharStream.h diff --git a/pglablib/CaseChangingCharStream.h b/pglablib/CaseChangingCharStream.h deleted file mode 100644 index 7a648c3..0000000 --- a/pglablib/CaseChangingCharStream.h +++ /dev/null @@ -1,83 +0,0 @@ -#pragma once - -#include "antlr4-runtime.h" -#include - -/// Helper stream for antlr, the lexer does not need to base case sensitive -/// this is achieved by changing the case of the chars in LA how ever -/// when the text of a recognized token is captured the getText function -/// is used which does no case conversion so the parse will receive the original -/// case. 
-class CaseChangingCharStream: public antlr4::CharStream -{ -public: - CaseChangingCharStream(antlr4::CharStream *stream, bool upper) - : stream(stream) - , upper(upper) - {} - - virtual ~CaseChangingCharStream() - {} - - virtual void consume() override - { - stream->consume(); - } - - virtual size_t LA(ssize_t i) override - { - int c = stream->LA(i); - if (c <= 0) - return c; - - if (upper) - return QChar::toUpper(c); - - return QChar::toLower(c); - } - - virtual std::string getText(const antlr4::misc::Interval &interval) override - { - return stream->getText(interval); - } - - virtual std::string toString() const override - { - return stream->toString(); - } - - virtual ssize_t mark() override - { - return stream->mark(); - } - - virtual void release(ssize_t marker) override - { - stream->release(marker); - } - - virtual size_t index() override - { - return stream->index(); - } - - virtual void seek(size_t index) override - { - stream->seek(index); - } - - virtual size_t size() override - { - return stream->size(); - } - - virtual std::string getSourceName() const override - { - return stream->getSourceName(); - } - -private: - antlr4::CharStream *stream; - bool upper; - -}; diff --git a/pglablib/PgsqlLexer.g4 b/pglablib/PgsqlLexer.g4 index 699dae0..55e2ce3 100644 --- a/pglablib/PgsqlLexer.g4 +++ b/pglablib/PgsqlLexer.g4 @@ -1,7 +1,7 @@ lexer grammar PgsqlLexer; @lexer::preinclude { -#include +#include } @@ -11,21 +11,61 @@ Dot: '.'; OpenParen: '('; CloseParen: ')'; -As: 'AS'; -By: 'BY'; -From: 'FROM'; -Full: 'FULL'; -Group: 'GROUP'; -Having: 'HAVING'; -Join: 'JOIN'; -Left : 'LEFT'; -Order : 'ORDER'; -Right : 'RIGHT'; -Select: 'SELECT'; -Where: 'WHERE'; +fragment A : 'a' | 'A'; +fragment B : 'B' | 'b'; +fragment C : 'C' | 'c'; +fragment D : 'D' | 'd'; +fragment E : 'E' | 'e'; +fragment F : 'F' | 'f'; +fragment G : 'G' | 'g'; +fragment H : 'H' | 'h'; +fragment I : 'I' | 'i'; +fragment J : 'J' | 'j'; +fragment K : 'K' | 'k'; +fragment L : 'L' | 'l'; +fragment M : 
'M' | 'm'; +fragment N : 'N' | 'n'; +fragment O : 'O' | 'o'; +fragment P : 'P' | 'p'; +fragment Q : 'Q' | 'q'; +fragment R : 'R' | 'r'; +fragment S : 'S' | 's'; +fragment T : 'T' | 't'; +fragment U : 'U' | 'u'; +fragment V : 'V' | 'v'; +fragment W : 'W' | 'w'; +fragment X : 'X' | 'x'; +fragment Y : 'Y' | 'y'; +fragment Z : 'Z' | 'z'; -Ident: [A-Za-z_][A-Za-z_0-9]* ; // match lower-case identifiers +As: A S; +By: B Y; +From: F R O M; +Full: F U L L; +Group: G R O U P; +Having: H A V I N G; +Join: J O I N; +Left : L E F T; +Order : O R D E R; +Right : R I G H T; +Select: S E L E C T; +Where: W H E R E; + +Ident: [\p{Alpha}_] [\p{Alnum}_$]* // first char: letter or '_'; rest: letters, digits, '_' or '$', so punctuation ends the ident + { + setText(QString::fromStdString(getText()).toLower().toStdString()); + } + | '"' ~["]+ '"' + { + { + std::string s = getText(); + s = s.substr(1, s.length() - 2); + setText(s); + } + }; IntegerLiteral: [1-9][0-9]*; -StringLiteral: '\'' ('\'\'' | ~ ('\''))* '\'' { setText(getText().substr(1, getText().length()-2)); }; +StringLiteral: '\'' ('\'\'' | ~['])* '\'' { setText(getText().substr(1, getText().length()-2)); }; // '*' (not '+') so the empty literal '' still lexes -Whitespace : [ \t\r\n]+ -> skip ; // skip spaces, tabs, newlines \ No newline at end of file +Whitespace: [\p{White_Space}]+ -> skip ; // skip runs of Unicode whitespace + +UnexpectedSymbol: .; \ No newline at end of file diff --git a/pglablib/pglablib.pro b/pglablib/pglablib.pro index 32608f8..974b722 100644 --- a/pglablib/pglablib.pro +++ b/pglablib/pglablib.pro @@ -104,7 +104,6 @@ SOURCES += \ catalog/PgSequenceContainer.cpp HEADERS += \ - CaseChangingCharStream.h \ Pglablib.h \ ASyncDBConnection.h \ ConnectionConfig.h \ diff --git a/pglablib/sqlparser/Parser.cpp b/pglablib/sqlparser/Parser.cpp index 92e6019..a6da16f 100644 --- a/pglablib/sqlparser/Parser.cpp +++ b/pglablib/sqlparser/Parser.cpp @@ -8,8 +8,7 @@ Parser::Parser(const std::string &input_string) Parser::Parser(std::unique_ptr stream) : InputStream(std::move(stream)) - , CaseFilter(InputStream.get(), true) - , Lexer(&CaseFilter) + , 
Lexer(InputStream.get()) , TokenStream(&Lexer) , AParser(&TokenStream) { diff --git a/pglablib/sqlparser/Parser.h b/pglablib/sqlparser/Parser.h index 7633252..173d354 100644 --- a/pglablib/sqlparser/Parser.h +++ b/pglablib/sqlparser/Parser.h @@ -2,7 +2,6 @@ #include ".generated/PgsqlLexer.h" #include ".generated/PgsqlParser.h" -#include "CaseChangingCharStream.h" #include "ErrorListener.h" class Parser @@ -19,7 +18,6 @@ public: } private: std::unique_ptr InputStream; - CaseChangingCharStream CaseFilter; PgsqlLexer Lexer; antlr4::CommonTokenStream TokenStream; PgsqlParser AParser; diff --git a/tests/pglabtests/tst_newParser.cpp b/tests/pglabtests/tst_newParser.cpp index f19bd0c..b8fd06e 100644 --- a/tests/pglabtests/tst_newParser.cpp +++ b/tests/pglabtests/tst_newParser.cpp @@ -6,6 +6,37 @@ using namespace testing; using namespace sqlast; +TEST(NewSqlLexer, Select) +{ + std::string source = "SELECT"; + antlr4::ANTLRInputStream input(source); + PgsqlLexer lexer(&input); + + auto token = lexer.nextToken(); + ASSERT_EQ(PgsqlLexer::Select, token->getType()); +} + +TEST(NewSqlLexer, Ident) +{ + std::string source = "Abc"; + antlr4::ANTLRInputStream input(source); + PgsqlLexer lexer(&input); + + auto token = lexer.nextToken(); + ASSERT_EQ(PgsqlLexer::Ident, token->getType()); + ASSERT_EQ("abc", token->getText()); +} + +TEST(NewSqlLexer, QuotedIdent) +{ + std::string source = "\"Abc\""; + antlr4::ANTLRInputStream input(source); + PgsqlLexer lexer(&input); + + auto token = lexer.nextToken(); + ASSERT_EQ(PgsqlLexer::Ident, token->getType()); + ASSERT_EQ("Abc", token->getText()); +} TEST(NewSqlParser, statementList) @@ -47,3 +78,47 @@ TEST(NewSqlParser, selectList) StringLiteral& string_literal = dynamic_cast(si.GetExpression()); ASSERT_EQ("Tekst", string_literal.GetValue()); } + +TEST(NewSqlParser, selectAliasWithoutAs) +{ + std::string input_string = "SELECT 1 a"; + Parser parser(input_string); + std::unique_ptr program = parser.Parse(); + + ASSERT_TRUE(program != nullptr); 
+ ASSERT_EQ(1, program->Count()); + ASSERT_EQ(0, parser.errorCount()); + + SelectStatement &s = dynamic_cast(program->Get(0)); + SelectList* sl = s.GetSelectList(); + SelectItem& si = sl->Get(0); + ASSERT_EQ("a", si.GetAlias()); +} + + +TEST(NewSqlParser, selectAliasWithAs) +{ + std::string input_string = "SELECT 1 AS b"; + Parser parser(input_string); + std::unique_ptr program = parser.Parse(); + + ASSERT_TRUE(program != nullptr); + ASSERT_EQ(1, program->Count()); + ASSERT_EQ(0, parser.errorCount()); + + SelectStatement &s = dynamic_cast(program->Get(0)); + SelectList* sl = s.GetSelectList(); + SelectItem& si = sl->Get(0); + ASSERT_EQ("b", si.GetAlias()); +} + +TEST(NewSqlParser, selectFrom) +{ + std::string input_string = "SELECT 1 FROM a"; + Parser parser(input_string); + std::unique_ptr program = parser.Parse(); + + ASSERT_TRUE(program != nullptr); + ASSERT_EQ(1, program->Count()); + ASSERT_EQ(0, parser.errorCount()); +}