change readme and add new idea for lexer

This commit is contained in:
Aleksey Chichenkov 2019-01-25 14:25:21 +03:00
parent 04420a5f9f
commit 2374e2b5d1
4 changed files with 268 additions and 203 deletions

View File

@ -1,6 +1,7 @@
# re2-js-generator # re2-js-generator
### Install guide ### Install guide
You need to install npm and Node.js v10.
npm install npm install
@ -12,8 +13,10 @@ Read re2c manual: http://re2c.org/manual/manual.html
or or
node parse_source_lexeme.js node main.js
The result will be written to the <lexer.js> file.
The result will be written to the out.js file.
Report for bugs: rolahd@yandex.ru Report for bugs: rolahd@yandex.ru

440
lexer.l
View File

@ -1,190 +1,255 @@
var lex = [ var types = [
"ERR", "LSB",
"DELIM", "RSB",
"LCB", "LCB",
"RCB", "RCB",
"INTEGER", "COLON",
"COMMA",
"DOT",
"REM",
"GT",
"GTE",
"LT",
"LTE",
"EQ",
"NEQ",
"LIKE",
"NLIKE",
"AND",
"OR",
"NOT",
"ADDRESS",
"TIME",
"TIMEDIFF",
"INTEGER_LITERAL",
"FLOAT_LITERAL", "FLOAT_LITERAL",
"BOOL_LITERAL", "BOOL_LITERAL",
"INTEGER_LITERAL", "ID"
]; ];
var searchString = function (_str, _quote, _yylexstart, _yycursor) { var errors = {
var found_back_slash = false; "-2": "not found close quote or singleQuote",
_yycursor++; "-1": "not found any lexemes or errors or anything else",
while(_yycursor < _str.length){ "0": "success",
var char = _str[_yycursor]; "1": "found unknown symbol"
if(_quote == '"') {
switch (char) {
case "\\":
found_back_slash = true;
break;
case '"':
if(!found_back_slash) {
return { success: true, pos: _yycursor + 1 }
}
found_back_slash = false;
break;
}
} else if(_quote == "'") {
switch (char) {
case "\\":
found_back_slash = true;
break;
case "'":
if(!found_back_slash) {
return { success: true, pos: _yycursor + 1 }
}
found_back_slash = false;
break;
}
}
_yycursor++;
}
return {success: false, pos: _yycursor + 1 }
}; };
var addLexeme = function(_str, _yylexstart, _yycursor, _lexeme) { var Lexer = function(_string){
if(_lexeme !== "ERR"){ this._last_found_lexeme = {error: -1};
console.log(print_f("found lex: %s; start: %s; end: %s; result => %s", _lexeme, _yylexstart, _yycursor, _str.substring(_yylexstart, _yycursor))); this._end = false;
} else { this._error = false;
console.log(print_f("search end\n")); this._string = _string;
return true; this._state = 1;
} this._yy_char = null;
this._yy_lex_start = 0;
this._yy_cursor = 0;
this._yy_marker = 0;
this._yy_accept = 0;
}; };
var unknownSymbol = function(_str, _yylexstart, _yycursor){ Lexer.prototype = {
throw print_f("Found unknown symbol on position: %s", _yycursor) types: types,
}; errors: errors,
_notFoundCloseQuote: function() {
this._error = true;
this._last_found_lexeme = {
error: 2,
start: this._yy_lex_start,
end: this._yy_cursor
};
var notFoundCloseQuote = function(_str, _yylexstart, _yycursor) { console.log( print_f("Not found close quote start: %s", this._yy_cursor));
console.log( print_f("Not found close quote start: %s", _yycursor)); },
throw print_f("Not found close quote start: %s", _yycursor); _unknownSymbol: function(){
}; this._error = true;
this._last_found_lexeme = {
var start_search = function(_str) { error: 1,
console.log("start search", _str); start: this._yy_lex_start,
var id = 1; end: this._yy_cursor
var yych = null;
var YYLEXSTART = 0;
var yyaccept = 0;
var YYCURSOR = 0;
var YYMARKER = 0;
var str = _str;
var reset = function(){
yyaccept = 0;
id = 1;
YYLEXSTART = YYCURSOR;
YYMARKER = YYCURSOR;
}; };
while(true) console.log( print_f("Found unknown symbol on position: %s", this._yy_cursor));
{ },
switch(id) /*!re2c _foundLexeme: function(_lexeme) {
re2c:define:YYCTYPE = _r2c_var_; console.log(print_f("found lex: %s; start: %s; end: %s; result => %s", _lexeme, this._yy_lex_start, this._yy_cursor, this._string.substring(this._yy_lex_start, this._yy_cursor)));
re2c:yyfill:enable = 0; this._last_found_lexeme = {
error: 0,
lexeme: _lexeme,
start: this._yy_lex_start,
end: this._yy_cursor
};
},
// Action for the `end` rule of the lexer: logs "search end", marks the lexer
// as finished and replaces the last-found lexeme with an error-only record.
_endOfString: function(){
console.log(print_f("search end\n"));
// Once _end is set, next() returns null and search() returns false without scanning.
this._end = true;
this._last_found_lexeme = {
// NOTE(review): the `errors` table in this file maps "-2" to
// "not found close quote or singleQuote" — confirm -2 is the intended code for end of input.
error: -2
};
},
_searchString: function () {
var _quote = this._string[this._yy_cursor - 1];
var found_back_slash = false;
while(this._yy_cursor < this._string.length){
this._yy_char = this._string[this._yy_cursor];
if(_quote == '"') {
switch (this._yy_char) {
case "\\":
found_back_slash = true;
break;
case '"':
if(!found_back_slash) {
this._yy_cursor++;
this._foundLexeme("STRING_LITERAL");
return;
}
found_back_slash = false;
break;
}
} else if(_quote == "'") {
switch (this._yy_char) {
case "\\":
found_back_slash = true;
break;
case "'":
if(!found_back_slash) {
this._yy_cursor++;
this._foundLexeme("STRING_LITERAL");
return;
}
found_back_slash = false;
break;
}
}
this._yy_cursor++;
}
D = [0-9]; this._notFoundCloseQuote();
end = "\x00"; },
L = [A-Za-z_]; _set_next: function(){
RL = [\U00000400-\U00000451]; this._yy_accept = 0;
this._state = 1;
this._yy_lex_start = this._yy_cursor;
this._yy_marker = this._yy_cursor;
},
next: function(){
if(this._end || this._error) return null;
this.search();
return this.token();
},
token: function(){
return this._last_found_lexeme;
},
search: function(){
if(this._end) return false;
CR = "\r"; while(true){
LF = "\n"; switch(id) /*!re2c
CRLF = CR?LF; re2c:define:YYCTYPE = _r2c_var_;
INTEGER = "-"?D+; re2c:define:YYCURSOR = this._yy_cursor;
SP = " "; re2c:define:YYMARKER = this._yy_marker;
TAB = "\t"; re2c:yyfill:enable = 0;
DELIM = SP|TAB|CR|LF; D = [0-9];
end = "\x00";
L = [A-Za-z_];
RL = [\U00000400-\U00000451];
LSB = "["; CR = "\r";
RSB = "]"; LF = "\n";
LCB = "("; CRLF = CR?LF;
RCB = ")"; INTEGER = "-"?D+;
COLON = ":"; SP = " ";
COMMA = ","; TAB = "\t";
DOT = ".";
REM = "%";
GT = ">";
GTE = ">=";
LT = "<";
LTE = "<=";
EQ = "==";
NEQ = "!=";
AND = 'AND'; DELIM = SP|TAB|CR|LF;
OR = 'OR';
NOT = 'NOT';
LIKE = 'LIKE';
NLIKE = 'NLIKE';
ADDRESS = "Address"; LSB = "[";
TIME = "Time"; RSB = "]";
TIMEDIFF = "TimeDiff"; LCB = "(";
RCB = ")";
COLON = ":";
COMMA = ",";
DOT = ".";
REM = "%";
GT = ">";
GTE = ">=";
LT = "<";
LTE = "<=";
EQ = "==";
NEQ = "!=";
BOOL_LITERAL = 'true'|'false'; AND = 'AND';
FLOAT_LITERAL = "-"? D* "." D+ ("e" "-"? D+)?; OR = 'OR';
INTEGER_LITERAL = INTEGER; NOT = 'NOT';
ID = L(L|D)*; LIKE = 'LIKE';
NLIKE = 'NLIKE';
QU = "\""; ADDRESS = "Address";
SQU = "'"; TIME = "Time";
TIMEDIFF = "TimeDiff";
BOOL_LITERAL = 'true'|'false';
FLOAT_LITERAL = "-"? D* "." D+ ("e" "-"? D+)?;
INTEGER_LITERAL = INTEGER;
ID = L(L|D)*;
QU = "\"";
SQU = "'";
end { if(addLexeme(str, YYLEXSTART, YYCURSOR, "ERR")) return; reset(); break; } end { this._endOfString(); return; }
LSB { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LSB")) return; reset(); break; } LSB { this._foundLexeme("LSB"); this._set_next(); return; }
RSB { if(addLexeme(str, YYLEXSTART, YYCURSOR, "RSB")) return; reset(); break; } RSB { this._foundLexeme("RSB"); this._set_next(); return; }
LCB { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LCB")) return; reset(); break; } LCB { this._foundLexeme("LCB"); this._set_next(); return; }
RCB { if(addLexeme(str, YYLEXSTART, YYCURSOR, "RCB")) return; reset(); break; } RCB { this._foundLexeme("RCB"); this._set_next(); return; }
COLON { if(addLexeme(str, YYLEXSTART, YYCURSOR, "COLON")) return; reset(); break; } COLON { this._foundLexeme("COLON"); this._set_next(); return; }
COMMA { if(addLexeme(str, YYLEXSTART, YYCURSOR, "COMMA")) return; reset(); break; } COMMA { this._foundLexeme("COMMA"); this._set_next(); return; }
DOT { if(addLexeme(str, YYLEXSTART, YYCURSOR, "DOT")) return; reset(); break; } DOT { this._foundLexeme("DOT"); this._set_next(); return; }
REM { if(addLexeme(str, YYLEXSTART, YYCURSOR, "REM")) return; reset(); break; } REM { this._foundLexeme("REM"); this._set_next(); return; }
GT { if(addLexeme(str, YYLEXSTART, YYCURSOR, "GT")) return; reset(); break; } GT { this._foundLexeme("GT"); this._set_next(); return; }
GTE { if(addLexeme(str, YYLEXSTART, YYCURSOR, "GTE")) return; reset(); break; } GTE { this._foundLexeme("GTE"); this._set_next(); return; }
LT { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LT")) return; reset(); break; } LT { this._foundLexeme("LT"); this._set_next(); return; }
LTE { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LTE")) return; reset(); break; } LTE { this._foundLexeme("LTE"); this._set_next(); return; }
EQ { if(addLexeme(str, YYLEXSTART, YYCURSOR, "EQ")) return; reset(); break; } EQ { this._foundLexeme("EQ"); this._set_next(); return; }
NEQ { if(addLexeme(str, YYLEXSTART, YYCURSOR, "NEQ")) return; reset(); break; } NEQ { this._foundLexeme("NEQ"); this._set_next(); return; }
LIKE { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LIKE")) return; reset(); break; } LIKE { this._foundLexeme("LIKE"); this._set_next(); return; }
NLIKE { if(addLexeme(str, YYLEXSTART, YYCURSOR, "NLIKE")) return; reset(); break; } NLIKE { this._foundLexeme("NLIKE"); this._set_next(); return; }
AND { if(addLexeme(str, YYLEXSTART, YYCURSOR, "AND")) return; reset(); break; } AND { this._foundLexeme("AND"); this._set_next(); return; }
OR { if(addLexeme(str, YYLEXSTART, YYCURSOR, "OR")) return; reset(); break; } OR { this._foundLexeme("OR"); this._set_next(); return; }
NOT { if(addLexeme(str, YYLEXSTART, YYCURSOR, "NOT")) return; reset(); break; } NOT { this._foundLexeme("NOT"); this._set_next(); return; }
ADDRESS { if(addLexeme(str, YYLEXSTART, YYCURSOR, "ADDRESS")) return; reset(); break; } ADDRESS { this._foundLexeme("ADDRESS"); this._set_next(); return; }
TIME { if(addLexeme(str, YYLEXSTART, YYCURSOR, "TIME")) return; reset(); break; } TIME { this._foundLexeme("TIME"); this._set_next(); return; }
TIMEDIFF { if(addLexeme(str, YYLEXSTART, YYCURSOR, "TIMEDIFF")) return; reset(); break; } TIMEDIFF { this._foundLexeme("TIMEDIFF"); this._set_next(); return; }
INTEGER_LITERAL { if(addLexeme(str, YYLEXSTART, YYCURSOR, "INTEGER_LITERAL")) return; reset(); break; } INTEGER_LITERAL { this._foundLexeme("INTEGER_LITERAL"); this._set_next(); return; }
FLOAT_LITERAL { if(addLexeme(str, YYLEXSTART, YYCURSOR, "FLOAT_LITERAL")) return; reset(); break; } FLOAT_LITERAL { this._foundLexeme("FLOAT_LITERAL"); this._set_next(); return; }
BOOL_LITERAL { if(addLexeme(str, YYLEXSTART, YYCURSOR, "BOOL_LITERAL")) return; reset(); break; } BOOL_LITERAL { this._foundLexeme("BOOL_LITERAL"); this._set_next(); return; }
ID { if(addLexeme(str, YYLEXSTART, YYCURSOR, "ID")) return; reset(); break; } ID { this._foundLexeme("ID"); this._set_next(); return; }
DELIM { reset(); break; } DELIM { this._set_next(); break; }
QU|SQU { id = 100000000; break;} QU|SQU { id = 100000000; break;}
[^] { if(unknownSymbol(str, YYLEXSTART, YYCURSOR)) return; reset(); break; } [^] { this._unknownSymbol(); this._set_next(); return; }
*/ENDER} */ENDER}
}
}; };
var print_f = function () {
var print_f = function() {
var r_str = ""; var r_str = "";
var next = arguments[0]; var next = arguments[0];
var rx = /(%[a-zA-Z]{1})/; var rx = /(%[a-zA-Z]{1})/;
var a = 1, match; var a = 1,
match;
while (match = rx.exec(next)) { while (match = rx.exec(next)) {
var prev = next.substring(0, match.index); var prev = next.substring(0, match.index);
var macro = next.substring(match.index + 1, match.index + 2); var macro = next.substring(match.index + 1, match.index + 2);
@ -193,17 +258,16 @@ var print_f = function () {
var arg = arguments[a]; var arg = arguments[a];
if(arg !== undefined) { if (arg !== undefined) {
switch (macro) { switch (macro) {
case "s": case "s":
if(arg.to_string && !arg.toString) r_str += arg.to_string(); r_str += arg.toString();
if(arg.toString && !arg.to_string) r_str += arg.toString();
break; break;
case "i": case "i":
r_str += (arg.to_number && arg.to_number()) || parseInt(arg); r_str += parseInt(arg);
break; break;
case "f": case "f":
r_str += (arg.to_number && arg.to_number()) || parseFloat(arg); r_str += parseFloat(arg);
break; break;
} }
} else { } else {
@ -217,41 +281,41 @@ var print_f = function () {
return r_str; return r_str;
}; };
console.log("TEST SINGLE") console.log("TEST SINGLE");
start_search("["); (new Lexer("[")).search();
start_search("]"); (new Lexer("]")).search();
start_search("("); (new Lexer("(")).search();
start_search(")"); (new Lexer(")").search());
start_search(":"); (new Lexer(":")).search();
start_search(","); (new Lexer(",")).search();
start_search("."); (new Lexer(".")).search();
start_search("%"); (new Lexer("%")).search();
start_search(">"); (new Lexer(">")).search();
start_search(">="); (new Lexer(">=")).search();
start_search("<"); (new Lexer("<")).search();
start_search("<="); (new Lexer("<=")).search();
start_search("=="); (new Lexer("==")).search();
start_search("!="); (new Lexer("!=")).search();
start_search("AND"); (new Lexer("AND")).search();
start_search("and"); (new Lexer("and")).search();
start_search("OR"); (new Lexer("OR")).search();
start_search("or"); (new Lexer("or")).search();
start_search("NOT"); (new Lexer("NOT")).search();
start_search("not"); (new Lexer("not")).search();
start_search("LIKE"); (new Lexer("LIKE")).search();
start_search("like"); (new Lexer("like")).search();
start_search("NLIKE"); (new Lexer("NLIKE")).search();
start_search("nlike"); (new Lexer("nlike")).search();
start_search("Address"); (new Lexer("Address")).search();
start_search("Time"); (new Lexer("Time")).search();
start_search("TimeDiff"); (new Lexer("TimeDiff")).search();
console.log("TEST ALL") var lex_test_all = new Lexer("[ ] ( ) : , . % > >= < <= == != AND and OR or NOT not LIKE like NLIKE nlike Address Time TimeDiff 'sdfadfasdf' \"asdfasfd\" ")
start_search("[ ] ( ) : , . % > >= < <= == != AND and OR or NOT not LIKE like NLIKE nlike Address Time TimeDiff "); var _lex;
while(_lex = lex_test_all.next()){
console.log("IN while:", _lex.lexeme);
}
console.log("TEST STRING LITERAL"); console.log("TEST STRING LITERAL");
start_search(' "111\\\"11\\\"1" "222222" '); (new Lexer(' "111\\\"11\\\"1" "222222" ')).search();
start_search(" '111\\\'11\\\'1' '222222' "); (new Lexer(" '111\\\'11\\\'1' '222222' ")).search();
console.log("TEST FAILS");
start_search(' sdfasdfasdfsdf "fasdf');

17
main.js
View File

@ -14,13 +14,12 @@ exec("re2c -i lexer.l", function(err, stdout, stderr) {
var post_process_lexer = function (_string) { var post_process_lexer = function (_string) {
var search_string = fs.readFileSync("search_string.js", "utf8");
// insert last case for string detect // insert last case for string detect
_string = _string.replace(/\}\nENDER}/gm, "yy100000000: { " + search_string + " reset(); break; }}}"); _string = _string.replace(/\}\nENDER}/gm, "yy100000000: { this._searchString(); this._set_next(); return; }}}");
_string = _string.replace(/^.*(_r2c_var_.*;|unsigned int yyaccept = 0;)\n/gm, ""); // replace var yych; _string = _string.replace(/^.*(_r2c_var_.*;|unsigned int yyaccept = 0;)\n/gm, ""); // replace var yych;
_string = _string.replace(/(yych = \*YYCURSOR);\n/gm, "\tcase 1:\n yych = str[YYCURSOR];\n"); // insert "case 1:" before; _string = _string.replace(/(yych = \*this._yy_cursor);\n/gm, "\tcase 1:\n yych = this._string[this._yy_cursor];\n"); // insert "case 1:" before;
_string = _string.replace(/\*(.*?);/gm, "str[$1];"); // замена разыменовываний _string = _string.replace(/\*(.*?);/gm, "this._string[$1];"); // замена разыменовываний
_string = _string.replace(/^yy(\d*?):/gm, "case $1:"); // replace goto marker onto case _string = _string.replace(/^yy(\d*?):/gm, "case $1:"); // replace goto marker onto case
_string = _string.replace(/\) goto yy(\d*?);/gm, ") { id = $1; break; }"); // replace goto inside if _string = _string.replace(/\) goto yy(\d*?);/gm, ") { id = $1; break; }"); // replace goto inside if
_string = _string.replace(/goto yy(\d*?);/gm, "id = $1; break;"); // replace goto outside if _string = _string.replace(/goto yy(\d*?);/gm, "id = $1; break;"); // replace goto outside if
@ -29,10 +28,16 @@ var post_process_lexer = function (_string) {
_string = _string.replace(/0x00/gm, 'undefined'); // replace 0x00 _string = _string.replace(/0x00/gm, 'undefined'); // replace 0x00
// black magic // black magic
_string = _string.replace(/(switch \(yych\) \{[\s\S]*?})/gm, "(function(){$1})(); break;"); // добавим замыкание что бы обработать свиче в свиче _string = _string.replace(/(switch \(yych\) \{[\s\S]*?})/gm, "(function(){$1}.bind(this))(); break;"); // добавим замыкание что бы обработать свиче в свиче
_string = _string.replace(/switch\((id)\)/gm, "switch(this._state)"); // replace id to this._state
_string = _string.replace(/id = (\d.*?);/gm, "this._state = $1;"); // replace id = n to this._state = n
_string = _string.replace(/yyaccept/gm, "this._yy_accept"); // replace yyaccept to this._yy_accept
_string = _string.replace(/yych/gm, "this._yy_char"); // replace yych to this._yy_char
_string = js_beautify(_string, {indent_size: 4, space_in_empty_paren: true}); _string = js_beautify(_string, {indent_size: 4, space_in_empty_paren: true});
fs.writeFileSync("out.js", _string); fs.writeFileSync("lexer.js", _string);
}; };

View File

@ -1,7 +0,0 @@
// String-literal handling snippet. It is not a standalone script: the old main.js
// read this file and spliced its text into the generated lexer after the
// `yy100000000:` label, followed by `reset(); break;`. It therefore relies on
// str, yych, YYLEXSTART and YYCURSOR being in scope at the insertion point.
// Scan forward for the closing quote; yych is passed as the quote argument.
var info = searchString(str, yych, YYLEXSTART, YYCURSOR);
if(info.success) {
// On success searchString reports the position one past the closing quote;
// advance the cursor there and record the span as a "STRING" lexeme.
YYCURSOR = info.pos;
addLexeme(str, YYLEXSTART, YYCURSOR, "STRING");
} else {
// No closing quote before the end of the input: delegate to the error handler
// (in the old lexer.l it logs the position and throws).
notFoundCloseQuote(str, YYLEXSTART, YYCURSOR);
}