Lexer improvements:

- Convert unquoted identifiers to lowercase.
- Recognize quoted identifiers.
- Allow all Unicode whitespace characters.
- Add an UnexpectedSymbol token for unexpected input (which was previously just ignored).
- Handle mixed-case keywords in the lexer file instead of filtering the stream.
This commit is contained in:
eelke 2022-04-03 20:09:58 +02:00
parent 81f27a6a18
commit 0da32b916c
6 changed files with 132 additions and 104 deletions

View file

@ -1,7 +1,7 @@
// Lexer-only grammar named PgsqlLexer.
lexer grammar PgsqlLexer;
// C++ headers requested for the generated lexer. QString is used by the Ident
// rule's action to lower-case unquoted identifiers.
@lexer::preinclude {
#include <memory>
#include <QString>
}
@ -11,21 +11,61 @@ Dot: '.';
// Parenthesis tokens.
OpenParen  : '(';
CloseParen : ')';

// Keyword tokens written as upper-case literals, so only the exact upper-case
// spelling matches.
As     : 'AS';
By     : 'BY';
From   : 'FROM';
Full   : 'FULL';
Group  : 'GROUP';
Having : 'HAVING';
Join   : 'JOIN';
Left   : 'LEFT';
Order  : 'ORDER';
Right  : 'RIGHT';
Select : 'SELECT';
Where  : 'WHERE';
// Case-insensitive letter fragments: each one matches a single ASCII letter in
// either lower or upper case. Fragments never produce tokens themselves; the
// keyword rules are spelled out of them (e.g. S E L E C T).
fragment A : [aA];
fragment B : [bB];
fragment C : [cC];
fragment D : [dD];
fragment E : [eE];
fragment F : [fF];
fragment G : [gG];
fragment H : [hH];
fragment I : [iI];
fragment J : [jJ];
fragment K : [kK];
fragment L : [lL];
fragment M : [mM];
fragment N : [nN];
fragment O : [oO];
fragment P : [pP];
fragment Q : [qQ];
fragment R : [rR];
fragment S : [sS];
fragment T : [tT];
fragment U : [uU];
fragment V : [vV];
fragment W : [wW];
fragment X : [xX];
fragment Y : [yY];
fragment Z : [zZ];
// Identifier: an ASCII letter or underscore followed by any number of ASCII
// letters, digits or underscores.
Ident: [A-Za-z_][A-Za-z_0-9]* ; // matches identifiers in either case (not only lower-case)
// Keyword tokens. Every keyword is a sequence of single-letter fragments, each
// of which accepts both cases, so any mix of upper and lower case matches.
As     : A S;
By     : B Y;
From   : F R O M;
Full   : F U L L;
Group  : G R O U P;
Having : H A V I N G;
Join   : J O I N;
Left   : L E F T;
Order  : O R D E R;
Right  : R I G H T;
Select : S E L E C T;
Where  : W H E R E;
// Identifier token, in two forms.
//
// Unquoted: a letter or underscore followed by any number of letters, digits,
// underscores or dollar signs. The token text is folded to lower case. The tail
// is an explicit character class rather than "anything that is not whitespace",
// so punctuation such as '.', ',' or '(' ends the identifier instead of being
// swallowed into it (otherwise "a.b" or "select(" would lex as one Ident).
//
// Quoted: text between double quotes, where a doubled quote ("") stands for one
// literal quote character. The token text is the content without the
// surrounding quotes and with doubled quotes collapsed; case is left untouched.
Ident
	: [\p{Alpha}_] [\p{Alnum}_$]*
	{
		setText(QString::fromStdString(getText()).toLower().toStdString());
	}
	| '"' ( '""' | ~["] )+ '"'
	{
		std::string s = getText();
		// Drop the opening and closing quote.
		s = s.substr(1, s.length() - 2);
		// Collapse every doubled quote into a single one.
		for (std::string::size_type pos = s.find("\"\""); pos != std::string::npos; pos = s.find("\"\"", pos + 1))
			s.erase(pos, 1);
		setText(s);
	}
	;
// Unsigned integer literal: one or more decimal digits. Leading zeros are
// accepted so that a plain 0 (and input such as 007) is recognised as a number
// instead of being left unmatched by this rule.
IntegerLiteral: [0-9]+;
// String literal in single quotes. A doubled quote ('') inside the literal is
// accepted as part of the content. The action removes only the first and last
// character (the surrounding quotes); doubled quotes stay doubled in the text.
StringLiteral: '\'' ('\'\'' | ~ ('\''))* '\'' { setText(getText().substr(1, getText().length()-2)); };
// String literal in single quotes. A doubled quote ('') inside the literal is
// accepted as part of the content. The content may be empty: the repetition is
// `*`, not `+`, so that the empty literal '' is a valid StringLiteral.
// The action removes only the surrounding quotes; doubled quotes stay doubled
// in the token text.
StringLiteral: '\'' ('\'\'' | ~['])* '\'' { setText(getText().substr(1, getText().length()-2)); };
// Runs of ASCII space, tab, carriage return and line feed produce no token.
Whitespace : [ \t\r\n]+ -> skip ; // skip spaces, tabs, newlines
// Whitespace produces no token. \p{White_Space} is the Unicode White_Space
// property, so this covers every Unicode whitespace character, not only ASCII
// spaces, tabs and newlines. The `+` consumes a whole run in one match instead
// of matching and discarding one character at a time.
Whitespace: [\p{White_Space}]+ -> skip;
// Catch-all: `.` matches any single character, so input that none of the rules
// listed before this one can match is reported as an UnexpectedSymbol token
// rather than being dropped.
UnexpectedSymbol: .;