change readme and add new idea for lexer

2019-01-25 14:25:21 +03:00 · 2019-01-25 14:25:21 +03:00 · 2374e2b5d1
commit 2374e2b5d1
parent 04420a5f9f
4 changed files with 268 additions and 203 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,7 @@
 # re2-js-generator
 ### Install guide
 You need to install npm and Node.js v10.
 npm install
@ -12,8 +13,10 @@ Read re2c manual: http://re2c.org/manual/manual.html
 or
-node parse_source_lexeme.js
+node main.js
 The result will be written to the <lexer.js> file.
 The result will be written to the out.js file.
 Report bugs to: rolahd@yandex.ru
--- a/lexer.l
+++ b/lexer.l
@ -1,190 +1,255 @@
-var lex = [
+var types = [
-    "ERR",
+    "LSB",
-    "DELIM",
+    "RSB",
    "LCB",
    "RCB",
-    "INTEGER",
+    "COLON",
-
+    "COMMA",
    "DOT",
    "REM",
    "GT",
    "GTE",
    "LT",
    "LTE",
    "EQ",
    "NEQ",
    "LIKE",
    "NLIKE",
    "AND",
    "OR",
    "NOT",
    "ADDRESS",
    "TIME",
    "TIMEDIFF",
    "INTEGER_LITERAL",
    "FLOAT_LITERAL",
    "BOOL_LITERAL",
-    "INTEGER_LITERAL",
+    "ID"
 ];
-var searchString = function (_str, _quote, _yylexstart, _yycursor) {
+var errors = {
-    var found_back_slash = false;
+    "-2": "not found close quote or singleQuote",
-    _yycursor++;
+    "-1": "not found any lexemes or errors or anything else",
-    while(_yycursor < _str.length){
+    "0": "success",
-        var char = _str[_yycursor];
+    "1": "found unknown symbol"
        if(_quote == '"') {
            switch (char) {
                case "\\":
                    found_back_slash = true;
                    break;
                case '"':
                    if(!found_back_slash) {
                        return { success: true, pos: _yycursor + 1 }
                    }
                    found_back_slash = false;
                    break;
            }
        } else if(_quote == "'") {
            switch (char) {
                case "\\":
                    found_back_slash = true;
                    break;
                case "'":
                    if(!found_back_slash) {
                        return { success: true, pos: _yycursor + 1 }
                    }
                    found_back_slash = false;
                    break;
            }
        }
        _yycursor++;
    }
    return {success: false, pos: _yycursor + 1 }
 };
-var addLexeme = function(_str, _yylexstart, _yycursor, _lexeme) {
+var Lexer = function(_string){
-    if(_lexeme !== "ERR"){
+    this._last_found_lexeme = {error: -1};
-        console.log(print_f("found lex: %s; start: %s; end: %s; result => %s", _lexeme, _yylexstart, _yycursor, _str.substring(_yylexstart, _yycursor)));
+    this._end = false;
-    } else {
+    this._error = false;
-      console.log(print_f("search end\n"));
+    this._string = _string;
-      return true;
+    this._state = 1;
-    }
+    this._yy_char = null;
    this._yy_lex_start = 0;
    this._yy_cursor = 0;
    this._yy_marker = 0;
    this._yy_accept = 0;
 };
-var unknownSymbol = function(_str, _yylexstart, _yycursor){
+Lexer.prototype = {
-    throw print_f("Found unknown symbol on position: %s", _yycursor)
+    types: types,
-};
+    errors: errors,
    _notFoundCloseQuote: function() {
         this._error = true;
         this._last_found_lexeme = {
             error: 2,
             start: this._yy_lex_start,
             end: this._yy_cursor
         };
-var notFoundCloseQuote = function(_str, _yylexstart, _yycursor) {
+         console.log( print_f("Not found close quote start: %s", this._yy_cursor));
-    console.log( print_f("Not found close quote start: %s", _yycursor));
+    },
-    throw print_f("Not found close quote start: %s", _yycursor);
+    _unknownSymbol: function(){
-};
+        this._error = true;
-
+        this._last_found_lexeme = {
-var start_search = function(_str) {
+            error: 1,
-        console.log("start search", _str);
+            start: this._yy_lex_start,
-        var id = 1;
+            end: this._yy_cursor
        var yych = null;
        var YYLEXSTART = 0;
        var yyaccept = 0;
        var YYCURSOR = 0;
        var YYMARKER = 0;
        var str = _str;
        var reset = function(){
            yyaccept = 0;
            id = 1;
            YYLEXSTART = YYCURSOR;
            YYMARKER = YYCURSOR;
        };
-while(true)
+        console.log( print_f("Found unknown symbol on position: %s", this._yy_cursor));
-{
+    },
-    switch(id)       /*!re2c
+    _foundLexeme: function(_lexeme) {
-            re2c:define:YYCTYPE = _r2c_var_;
+        console.log(print_f("found lex: %s; start: %s; end: %s; result => %s", _lexeme, this._yy_lex_start, this._yy_cursor, this._string.substring(this._yy_lex_start, this._yy_cursor)));
-            re2c:yyfill:enable = 0;
+        this._last_found_lexeme = {
            error: 0,
            lexeme: _lexeme,
            start: this._yy_lex_start,
            end: this._yy_cursor
        };
      },
      /**
       * Action for the re2c `end` rule: marks the lexer as finished.
       *
       * Logs "search end", sets `this._end` (after which `next()` returns
       * null and `search()` returns false) and replaces the last token
       * record with a bare `{error: -2}` object that carries no
       * `lexeme`/`start`/`end` fields.
       *
       * NOTE(review): in the `errors` table the key "-2" is described as
       * "not found close quote or singleQuote", while `_notFoundCloseQuote`
       * stores `error: 2`, which has no entry in that table. The two codes
       * look mismatched - confirm which code is meant for end of input.
       */
      _endOfString: function(){
        console.log(print_f("search end\n"));
        this._end = true;
        this._last_found_lexeme = {
            error: -2
        };
      },
      _searchString: function () {
          var _quote = this._string[this._yy_cursor - 1];
          var found_back_slash = false;
          while(this._yy_cursor < this._string.length){
             this._yy_char = this._string[this._yy_cursor];
             if(_quote == '"') {
                 switch (this._yy_char) {
                     case "\\":
                         found_back_slash = true;
                         break;
                     case '"':
                         if(!found_back_slash) {
                             this._yy_cursor++;
                             this._foundLexeme("STRING_LITERAL");
                             return;
                         }
                         found_back_slash = false;
                         break;
                 }
             } else if(_quote == "'") {
                 switch (this._yy_char) {
                     case "\\":
                         found_back_slash = true;
                         break;
                     case "'":
                         if(!found_back_slash) {
                             this._yy_cursor++;
                             this._foundLexeme("STRING_LITERAL");
                             return;
                         }
                         found_back_slash = false;
                         break;
                 }
             }
             this._yy_cursor++;
         }
-            D                           = [0-9];
+         this._notFoundCloseQuote();
-            end                         = "\x00";
+     },
-            L                           = [A-Za-z_];
+    _set_next: function(){
-            RL                          = [\U00000400-\U00000451];
+        this._yy_accept = 0;
        this._state = 1;
        this._yy_lex_start = this._yy_cursor;
        this._yy_marker = this._yy_cursor;
    },
    /**
     * Runs one `search()` step and returns the token record it produced.
     *
     * @returns {Object|null} the record returned by `token()`, or null when
     *     the end of input or an error was already recorded by a previous
     *     step.
     */
    next: function(){
        // The flags are checked BEFORE searching, so the step that reaches the
        // end of input (or an error) still returns its record; null is only
        // returned on the following call.
        if(this._end || this._error) return null;
        this.search();
        return this.token();
    },
    /**
     * Returns the most recently stored token record without advancing.
     *
     * @returns {Object} `this._last_found_lexeme`; the constructor
     *     initialises it to `{error: -1}`, so that is what is returned
     *     before any search step has stored a record.
     */
    token: function(){
        return this._last_found_lexeme;
    },
    search: function(){
        if(this._end) return false;
-            CR                          = "\r";
+        while(true){
-            LF                          = "\n";
+        switch(id)       /*!re2c
-            CRLF                        = CR?LF;
+                            re2c:define:YYCTYPE = _r2c_var_;
-            INTEGER                     = "-"?D+;
+                            re2c:define:YYCURSOR = this._yy_cursor;
-            SP                          = " ";
+                            re2c:define:YYMARKER = this._yy_marker;
-            TAB                         = "\t";
+                            re2c:yyfill:enable = 0;
-            DELIM                       = SP|TAB|CR|LF;
+                            D                           = [0-9];
                            end                         = "\x00";
                            L                           = [A-Za-z_];
                            RL                          = [\U00000400-\U00000451];
-            LSB                         = "[";
+                            CR                          = "\r";
-            RSB                         = "]";
+                            LF                          = "\n";
-            LCB                         = "(";
+                            CRLF                        = CR?LF;
-            RCB                         = ")";
+                            INTEGER                     = "-"?D+;
-            COLON                       = ":";
+                            SP                          = " ";
-            COMMA                       = ",";
+                            TAB                         = "\t";
            DOT                         = ".";
            REM                         = "%";
            GT                          = ">";
            GTE                         = ">=";
            LT                          = "<";
            LTE                         = "<=";
            EQ                          = "==";
            NEQ                         = "!=";
-            AND                         = 'AND';
+                            DELIM                       = SP|TAB|CR|LF;
            OR                          = 'OR';
            NOT                         = 'NOT';
            LIKE                        = 'LIKE';
            NLIKE                       = 'NLIKE';
-            ADDRESS                     = "Address";
+                            LSB                         = "[";
-            TIME                        = "Time";
+                            RSB                         = "]";
-            TIMEDIFF                    = "TimeDiff";
+                            LCB                         = "(";
                            RCB                         = ")";
                            COLON                       = ":";
                            COMMA                       = ",";
                            DOT                         = ".";
                            REM                         = "%";
                            GT                          = ">";
                            GTE                         = ">=";
                            LT                          = "<";
                            LTE                         = "<=";
                            EQ                          = "==";
                            NEQ                         = "!=";
-            BOOL_LITERAL                = 'true'|'false';
+                            AND                         = 'AND';
-            FLOAT_LITERAL               = "-"? D* "." D+ ("e" "-"? D+)?;
+                            OR                          = 'OR';
-            INTEGER_LITERAL             = INTEGER;
+                            NOT                         = 'NOT';
-            ID                          = L(L|D)*;
+                            LIKE                        = 'LIKE';
                            NLIKE                       = 'NLIKE';
-            QU = "\"";
+                            ADDRESS                     = "Address";
-            SQU = "'";
+                            TIME                        = "Time";
                            TIMEDIFF                    = "TimeDiff";
                            BOOL_LITERAL                = 'true'|'false';
                            FLOAT_LITERAL               = "-"? D* "." D+ ("e" "-"? D+)?;
                            INTEGER_LITERAL             = INTEGER;
                            ID                          = L(L|D)*;
                            QU = "\"";
                            SQU = "'";
-            end                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "ERR")) return; reset(); break; }
+                            end                         { this._endOfString(); return; }
-            LSB                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LSB")) return; reset(); break; }
+                            LSB                         { this._foundLexeme("LSB"); this._set_next(); return; }
-            RSB                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "RSB")) return; reset(); break; }
+                            RSB                         { this._foundLexeme("RSB"); this._set_next(); return; }
-            LCB                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LCB")) return; reset(); break; }
+                            LCB                         { this._foundLexeme("LCB"); this._set_next(); return; }
-            RCB                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "RCB")) return; reset(); break; }
+                            RCB                         { this._foundLexeme("RCB"); this._set_next(); return; }
-            COLON                       { if(addLexeme(str, YYLEXSTART, YYCURSOR, "COLON")) return; reset(); break; }
+                            COLON                       { this._foundLexeme("COLON"); this._set_next(); return; }
-            COMMA                       { if(addLexeme(str, YYLEXSTART, YYCURSOR, "COMMA")) return; reset(); break; }
+                            COMMA                       { this._foundLexeme("COMMA"); this._set_next(); return; }
-            DOT                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "DOT")) return; reset(); break; }
+                            DOT                         { this._foundLexeme("DOT"); this._set_next(); return; }
-            REM                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "REM")) return; reset(); break; }
+                            REM                         { this._foundLexeme("REM"); this._set_next(); return; }
-            GT                          { if(addLexeme(str, YYLEXSTART, YYCURSOR, "GT")) return; reset(); break; }
+                            GT                          { this._foundLexeme("GT"); this._set_next(); return; }
-            GTE                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "GTE")) return; reset(); break; }
+                            GTE                         { this._foundLexeme("GTE"); this._set_next(); return; }
-            LT                          { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LT")) return; reset(); break; }
+                            LT                          { this._foundLexeme("LT"); this._set_next(); return; }
-            LTE                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LTE")) return; reset(); break; }
+                            LTE                         { this._foundLexeme("LTE"); this._set_next(); return; }
-            EQ                          { if(addLexeme(str, YYLEXSTART, YYCURSOR, "EQ")) return; reset(); break; }
+                            EQ                          { this._foundLexeme("EQ"); this._set_next(); return; }
-            NEQ                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "NEQ")) return; reset(); break; }
+                            NEQ                         { this._foundLexeme("NEQ"); this._set_next(); return; }
-            LIKE                        { if(addLexeme(str, YYLEXSTART, YYCURSOR, "LIKE")) return; reset(); break; }
+                            LIKE                        { this._foundLexeme("LIKE"); this._set_next(); return; }
-            NLIKE                       { if(addLexeme(str, YYLEXSTART, YYCURSOR, "NLIKE")) return; reset(); break; }
+                            NLIKE                       { this._foundLexeme("NLIKE"); this._set_next(); return; }
-            AND                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "AND")) return; reset(); break; }
+                            AND                         { this._foundLexeme("AND"); this._set_next(); return; }
-            OR                          { if(addLexeme(str, YYLEXSTART, YYCURSOR, "OR")) return; reset(); break; }
+                            OR                          { this._foundLexeme("OR"); this._set_next(); return; }
-            NOT                         { if(addLexeme(str, YYLEXSTART, YYCURSOR, "NOT")) return; reset(); break; }
+                            NOT                         { this._foundLexeme("NOT"); this._set_next(); return; }
-            ADDRESS                     { if(addLexeme(str, YYLEXSTART, YYCURSOR, "ADDRESS")) return; reset(); break; }
+                            ADDRESS                     { this._foundLexeme("ADDRESS"); this._set_next(); return; }
-            TIME                        { if(addLexeme(str, YYLEXSTART, YYCURSOR, "TIME")) return; reset(); break; }
+                            TIME                        { this._foundLexeme("TIME"); this._set_next(); return; }
-            TIMEDIFF                    { if(addLexeme(str, YYLEXSTART, YYCURSOR, "TIMEDIFF")) return; reset(); break; }
+                            TIMEDIFF                    { this._foundLexeme("TIMEDIFF"); this._set_next(); return; }
-            INTEGER_LITERAL             { if(addLexeme(str, YYLEXSTART, YYCURSOR, "INTEGER_LITERAL")) return; reset(); break; }
+                            INTEGER_LITERAL             { this._foundLexeme("INTEGER_LITERAL"); this._set_next(); return; }
-            FLOAT_LITERAL               { if(addLexeme(str, YYLEXSTART, YYCURSOR, "FLOAT_LITERAL")) return; reset(); break; }
+                            FLOAT_LITERAL               { this._foundLexeme("FLOAT_LITERAL"); this._set_next(); return; }
-            BOOL_LITERAL                { if(addLexeme(str, YYLEXSTART, YYCURSOR, "BOOL_LITERAL")) return; reset(); break; }
+                            BOOL_LITERAL                { this._foundLexeme("BOOL_LITERAL"); this._set_next(); return; }
-            ID                          { if(addLexeme(str, YYLEXSTART, YYCURSOR, "ID")) return; reset(); break; }
+                            ID                          { this._foundLexeme("ID"); this._set_next(); return; }
-            DELIM                       { reset(); break;  }
+                            DELIM                       { this._set_next(); break;  }
-            QU|SQU                      { id = 100000000; break;}
+                            QU|SQU                      { id = 100000000; break;}
-            [^]                         { if(unknownSymbol(str, YYLEXSTART, YYCURSOR)) return; reset(); break;  }
+                            [^]                         { this._unknownSymbol(); this._set_next(); return;  }
-        */ENDER}
+                        */ENDER}
    }
 };
-var print_f = function () {
+
 var print_f = function() {
    var r_str = "";
    var next = arguments[0];
    var rx = /(%[a-zA-Z]{1})/;
-    var a = 1, match;
+    var a = 1,
        match;
    while (match = rx.exec(next)) {
        var prev = next.substring(0, match.index);
        var macro = next.substring(match.index + 1, match.index + 2);
@ -193,17 +258,16 @@ var print_f = function () {
        var arg = arguments[a];
-        if(arg !== undefined) {
+        if (arg !== undefined) {
            switch (macro) {
                case "s":
-                    if(arg.to_string && !arg.toString) r_str += arg.to_string();
+                    r_str += arg.toString();
                    if(arg.toString && !arg.to_string) r_str += arg.toString();
                    break;
                case "i":
-                    r_str += (arg.to_number && arg.to_number()) || parseInt(arg);
+                    r_str += parseInt(arg);
                    break;
                case "f":
-                    r_str += (arg.to_number && arg.to_number()) || parseFloat(arg);
+                    r_str += parseFloat(arg);
                    break;
            }
        } else {
@ -217,41 +281,41 @@ var print_f = function () {
    return r_str;
 };
-console.log("TEST SINGLE")
+console.log("TEST SINGLE");
-start_search("[");
+(new Lexer("[")).search();
-start_search("]");
+(new Lexer("]")).search();
-start_search("(");
+(new Lexer("(")).search();
-start_search(")");
+(new Lexer(")").search());
-start_search(":");
+(new Lexer(":")).search();
-start_search(",");
+(new Lexer(",")).search();
-start_search(".");
+(new Lexer(".")).search();
-start_search("%");
+(new Lexer("%")).search();
-start_search(">");
+(new Lexer(">")).search();
-start_search(">=");
+(new Lexer(">=")).search();
-start_search("<");
+(new Lexer("<")).search();
-start_search("<=");
+(new Lexer("<=")).search();
-start_search("==");
+(new Lexer("==")).search();
-start_search("!=");
+(new Lexer("!=")).search();
-start_search("AND");
+(new Lexer("AND")).search();
-start_search("and");
+(new Lexer("and")).search();
-start_search("OR");
+(new Lexer("OR")).search();
-start_search("or");
+(new Lexer("or")).search();
-start_search("NOT");
+(new Lexer("NOT")).search();
-start_search("not");
+(new Lexer("not")).search();
-start_search("LIKE");
+(new Lexer("LIKE")).search();
-start_search("like");
+(new Lexer("like")).search();
-start_search("NLIKE");
+(new Lexer("NLIKE")).search();
-start_search("nlike");
+(new Lexer("nlike")).search();
-start_search("Address");
+(new Lexer("Address")).search();
-start_search("Time");
+(new Lexer("Time")).search();
-start_search("TimeDiff");
+(new Lexer("TimeDiff")).search();
-console.log("TEST ALL")
+var lex_test_all = new Lexer("[  ]  (  )  :  ,  .  %  >  >=  <  <=  ==  !=  AND  and  OR  or  NOT  not  LIKE  like  NLIKE  nlike  Address  Time  TimeDiff 'sdfadfasdf' \"asdfasfd\" ")
-start_search("[  ]  (  )  :  ,  .  %  >  >=  <  <=  ==  !=  AND  and  OR  or  NOT  not  LIKE  like  NLIKE  nlike  Address  Time  TimeDiff  ");
+var _lex;
 while(_lex = lex_test_all.next()){
    console.log("IN while:", _lex.lexeme);
 }
 console.log("TEST STRING LITERAL");
-start_search('  "111\\\"11\\\"1" "222222" ');
+(new Lexer('  "111\\\"11\\\"1" "222222" ')).search();
-start_search("  '111\\\'11\\\'1' '222222' ");
+(new Lexer("  '111\\\'11\\\'1' '222222' ")).search();
 console.log("TEST FAILS");
 start_search('  sdfasdfasdfsdf "fasdf');
--- a/main.js
+++ b/main.js
@ -14,13 +14,12 @@ exec("re2c -i lexer.l", function(err, stdout, stderr) {
 var post_process_lexer = function (_string) {
    var search_string = fs.readFileSync("search_string.js", "utf8");
    // insert last case for string detect
-    _string = _string.replace(/\}\nENDER}/gm, "yy100000000: { " + search_string + " reset(); break; }}}");
+    _string = _string.replace(/\}\nENDER}/gm, "yy100000000: { this._searchString(); this._set_next(); return; }}}");
    _string = _string.replace(/^.*(_r2c_var_.*;|unsigned int yyaccept = 0;)\n/gm, ""); // replace var yych;
-    _string = _string.replace(/(yych = \*YYCURSOR);\n/gm, "\tcase 1:\n yych = str[YYCURSOR];\n"); // insert "case 1:" before;
+    _string = _string.replace(/(yych = \*this._yy_cursor);\n/gm, "\tcase 1:\n yych = this._string[this._yy_cursor];\n"); // insert "case 1:" before;
-    _string = _string.replace(/\*(.*?);/gm, "str[$1];");  // замена разыменовываний
+    _string = _string.replace(/\*(.*?);/gm, "this._string[$1];");  // замена разыменовываний
    _string = _string.replace(/^yy(\d*?):/gm, "case $1:"); // replace goto marker onto case
    _string = _string.replace(/\) goto yy(\d*?);/gm, ") { id = $1; break; }"); // replace goto inside if
    _string = _string.replace(/goto yy(\d*?);/gm, "id = $1; break;"); // replace goto outside if
@ -29,10 +28,16 @@ var post_process_lexer = function (_string) {
    _string = _string.replace(/0x00/gm, 'undefined'); // replace 0x00
    // black magic
-    _string = _string.replace(/(switch \(yych\) \{[\s\S]*?})/gm, "(function(){$1})(); break;"); // добавим замыкание что бы обработать свиче в свиче
+    _string = _string.replace(/(switch \(yych\) \{[\s\S]*?})/gm, "(function(){$1}.bind(this))(); break;"); // добавим замыкание что бы обработать свиче в свиче
    _string = _string.replace(/switch\((id)\)/gm, "switch(this._state)"); // replace id to this._state
    _string = _string.replace(/id = (\d.*?);/gm, "this._state = $1;"); // replace id = n to this._state = n
    _string = _string.replace(/yyaccept/gm, "this._yy_accept"); // replace yyaccept to this._yy_accept
    _string = _string.replace(/yych/gm, "this._yy_char"); // replace yych to this._yy_char
    _string = js_beautify(_string, {indent_size: 4, space_in_empty_paren: true});
-    fs.writeFileSync("out.js", _string);
+    fs.writeFileSync("lexer.js", _string);
 };
--- a/search_string.js
+++ b/search_string.js
@ -1,7 +0,0 @@
 var info = searchString(str, yych, YYLEXSTART, YYCURSOR);
 if(info.success) {
    YYCURSOR = info.pos;
    addLexeme(str, YYLEXSTART, YYCURSOR, "STRING");
 } else {
    notFoundCloseQuote(str, YYLEXSTART, YYCURSOR);
 }