Lexer improvements:

- Convert unquoted idents to lowercase.
- Recognize quoted idents.
- Allow all Unicode whitespace characters.
- Added an UnexpectedSymbol token for unexpected input (previously such input was silently ignored).
- Handle mixed-case keywords in the lexer grammar instead of filtering the character stream.
This commit is contained in:
eelke 2022-04-03 20:09:58 +02:00
parent 81f27a6a18
commit 0da32b916c
6 changed files with 132 additions and 104 deletions

View file

@ -1,83 +0,0 @@
#pragma once
#include "antlr4-runtime.h"
#include <QChar>
/// Character stream wrapper for antlr that lets the lexer be case insensitive.
///
/// Only the lookahead (LA) is affected: the characters it reports are
/// converted to upper or lower case. getText() is forwarded unchanged, so the
/// text captured for a recognized token keeps the case of the original input
/// and that is what the parser receives.
class CaseChangingCharStream: public antlr4::CharStream
{
public:
    /// @param stream  the stream being wrapped; only the pointer is stored,
    ///                this class never deletes it
    /// @param upper   true to report upper case from LA, false for lower case
    CaseChangingCharStream(antlr4::CharStream *stream, bool upper)
        : stream(stream)
        , upper(upper)
    {}

    virtual ~CaseChangingCharStream() = default;

    /// Forwarded to the wrapped stream.
    void consume() override { stream->consume(); }

    /// Lookahead with case conversion. Values <= 0 coming from the wrapped
    /// stream are passed through without conversion.
    size_t LA(ssize_t i) override
    {
        const int c = stream->LA(i);
        if (c > 0)
            return upper ? QChar::toUpper(c) : QChar::toLower(c);
        return c;
    }

    /// Text of the wrapped stream, no case conversion applied.
    std::string getText(const antlr4::misc::Interval &interval) override
    {
        return stream->getText(interval);
    }

    // Everything below simply delegates to the wrapped stream.

    std::string toString() const override { return stream->toString(); }

    ssize_t mark() override { return stream->mark(); }

    void release(ssize_t marker) override { stream->release(marker); }

    size_t index() override { return stream->index(); }

    void seek(size_t index) override { stream->seek(index); }

    size_t size() override { return stream->size(); }

    std::string getSourceName() const override { return stream->getSourceName(); }

private:
    antlr4::CharStream *stream; ///< wrapped stream, all calls end up here
    bool upper;                 ///< direction of the case conversion in LA
};

View file

@ -1,7 +1,7 @@
lexer grammar PgsqlLexer;
@lexer::preinclude {
#include <memory>
#include <QString>
}
@ -11,21 +11,61 @@ Dot: '.';
// Punctuation tokens.
OpenParen: '(';
CloseParen: ')';
// Keywords written as literal upper-case strings: only this exact spelling matches.
As: 'AS';
By: 'BY';
From: 'FROM';
Full: 'FULL';
Group: 'GROUP';
Having: 'HAVING';
Join: 'JOIN';
Left : 'LEFT';
Order : 'ORDER';
Right : 'RIGHT';
Select: 'SELECT';
Where: 'WHERE';
// One fragment per ASCII letter, each matching the letter in either case.
// Used to spell keywords that are recognized regardless of case.
fragment A : [Aa];
fragment B : [Bb];
fragment C : [Cc];
fragment D : [Dd];
fragment E : [Ee];
fragment F : [Ff];
fragment G : [Gg];
fragment H : [Hh];
fragment I : [Ii];
fragment J : [Jj];
fragment K : [Kk];
fragment L : [Ll];
fragment M : [Mm];
fragment N : [Nn];
fragment O : [Oo];
fragment P : [Pp];
fragment Q : [Qq];
fragment R : [Rr];
fragment S : [Ss];
fragment T : [Tt];
fragment U : [Uu];
fragment V : [Vv];
fragment W : [Ww];
fragment X : [Xx];
fragment Y : [Yy];
fragment Z : [Zz];
Ident: [A-Za-z_][A-Za-z_0-9]* ; // ASCII identifier in either case: letter or underscore, then letters, digits or underscores
// Keywords, spelled letter by letter with the single-letter fragments so that
// any mix of upper and lower case is recognized.
As: A S;
By: B Y;
From: F R O M;
Full: F U L L;
Group: G R O U P;
Having: H A V I N G;
Join: J O I N;
Left : L E F T;
Order : O R D E R;
Right : R I G H T;
Select: S E L E C T;
Where: W H E R E;
// Identifier, in one of two forms.
//  - Unquoted: a letter or underscore followed by letters, digits or
//    underscores. The token text is converted to lower case.
//  - Quoted: a non-empty run of characters other than the double quote,
//    enclosed in double quotes. The token text is the content without the
//    quotes, with its case left as written.
// The tail of the unquoted form is limited to identifier characters so that
// punctuation written directly after a name (dot, comma, parenthesis) ends
// the identifier and is lexed as its own token instead of being absorbed.
// The inner pair of braces in the second action gives the local variable its
// own scope inside the generated action code.
Ident: [\p{Alpha}_] [\p{Alnum}_]*
    {
        setText(QString::fromStdString(getText()).toLower().toStdString());
    }
    | '"' ~["]+ '"'
    {
        {
            std::string s = getText();
            s = s.substr(1, s.length() - 2);
            setText(s);
        }
    };
// Unsigned integer literal: one or more decimal digits, so a plain 0 (and
// leading zeros) are accepted as well.
IntegerLiteral: [0-9]+;
// Single-quoted string literal, possibly empty; a doubled quote inside the
// literal is accepted as part of it. The action replaces the token text with
// everything between the first and the last character (the enclosing quotes).
StringLiteral: '\'' ('\'\'' | ~ ('\''))* '\'' { setText(getText().substr(1, getText().length()-2)); };
// Single-quoted string literal; a doubled quote inside the literal is accepted
// as part of it. The content is repeated zero or more times so that the empty
// literal (two quotes with nothing in between) is recognized too.
// The action replaces the token text with the content between the enclosing
// quotes; doubled quotes in the content are left as written.
StringLiteral: '\'' ('\'\'' | ~['])* '\'' { setText(getText().substr(1, getText().length()-2)); };
Whitespace : [ \t\r\n]+ -> skip ; // skip runs of spaces, tabs, carriage returns and newlines
Whitespace: [\p{White_Space}]+ -> skip ; // skip runs of any Unicode whitespace
// Catch-all: matches any single character, so input that fits no other rule
// still produces a token.
UnexpectedSymbol: .;

View file

@ -104,7 +104,6 @@ SOURCES += \
catalog/PgSequenceContainer.cpp
HEADERS += \
CaseChangingCharStream.h \
Pglablib.h \
ASyncDBConnection.h \
ConnectionConfig.h \

View file

@ -8,8 +8,7 @@ Parser::Parser(const std::string &input_string)
Parser::Parser(std::unique_ptr<antlr4::CharStream> stream)
: InputStream(std::move(stream))
, CaseFilter(InputStream.get(), true)
, Lexer(&CaseFilter)
, Lexer(InputStream.get())
, TokenStream(&Lexer)
, AParser(&TokenStream)
{

View file

@ -2,7 +2,6 @@
#include ".generated/PgsqlLexer.h"
#include ".generated/PgsqlParser.h"
#include "CaseChangingCharStream.h"
#include "ErrorListener.h"
class Parser
@ -19,7 +18,6 @@ public:
}
private:
std::unique_ptr<antlr4::CharStream> InputStream;
CaseChangingCharStream CaseFilter;
PgsqlLexer Lexer;
antlr4::CommonTokenStream TokenStream;
PgsqlParser AParser;