Home

Awesome

lex-parser [OBSOLETED]

Join the chat at https://gitter.im/jison-parsers-lexers/Lobby Build Status NPM version Dependency Status npm

A parser for lexical grammars used by jison and jison-lex.

deprecation notice

As of 2017-10-15, this lex-parser repository is obsolete as the home of the lex-parser package/codebase: the primary source is now the jison monorepo's packages/lex-parser/ directory. See also https://github.com/GerHobbelt/jison/issues/16.

(For a comparable argument, see also "Why is Babel a monorepo?")

Issues, pull requests, etc. for lex-parser should be filed there; hence we do not accept issue reports in this secondary repository any more.

This repository will track the primary source for a while still, but be very aware that this particular repository will always be lagging behind!

install

npm install @gerhobbelt/lex-parser

build

To build the parser yourself, follow the install & build directions of the monorepo.

Note about ES6/rollup usage vs. ES5

All dist/ library files are 'self-contained': they bundle all 'local imports' from within this jison monorepo, so whichever of these source files you choose to use, you only need to worry about importing external dependencies (such as recast).

As such, these dist/ files should be easier to minify and/or use in older (ES5) environments.

rollup

Iff you use rollup or similar tools in an ES6/ES2015/ES2017 setting, then the package.json::module has already been set up for you to use the original sources instead!

usage

var lexParser = require("@gerhobbelt/lex-parser");

// parse a lexical grammar and return JSON
lexParser.parse("%% ... ");

example

The parser can parse its own lexical grammar, shown below:

%code imports %{
  import helpers from 'jison-helpers-lib';
%}



// Plain ASCII letters only. The second range must end in an uppercase `Z`:
// a range written as `A-z` would also match the punctuation characters
// `[`, `\`, `]`, `^`, `_` and the backquote, which sit between `Z` and `a`.
ASCII_LETTER                            [a-zA-Z]
// \p{Alphabetic} already includes [a-zA-Z], hence we don't need to merge
// with {UNICODE_LETTER} (though jison has code to optimize if you *did*
// include the `[a-zA-Z]` anyway):
UNICODE_LETTER                          [\p{Alphabetic}]
ALPHA                                   [{UNICODE_LETTER}_]
DIGIT                                   [\p{Number}]
WHITESPACE                              [\s\r\n\p{Separator}]
ALNUM                                   [{ALPHA}{DIGIT}]

NAME                                    [{ALPHA}](?:[{ALNUM}-]*{ALNUM})?
ID                                      [{ALPHA}]{ALNUM}*
DECIMAL_NUMBER                          [1-9][0-9]*
HEX_NUMBER                              "0"[xX][0-9a-fA-F]+
BR                                      \r\n|\n|\r
// WhiteSpace MUST NOT match CR/LF and the regex `\s` DOES, so we cannot use
// that one directly. Instead we define the {WS} macro here:
WS                                      [^\S\r\n]

// Quoted string content: support *escaped* quotes inside strings:
QUOTED_STRING_CONTENT                   (?:\\\'|\\[^\']|[^\\\'\r\n])*
DOUBLEQUOTED_STRING_CONTENT             (?:\\\"|\\[^\"]|[^\\\"\r\n])*
// backquoted ES6/ES2017 string templates MAY span multiple lines:
ES2017_STRING_CONTENT                   (?:\\\`|\\[^\`]|[^\\\`])*

// Accept any non-regex-special character as a direct literal without
// the need to put quotes around it:
ANY_LITERAL_CHAR                        [^\s\r\n<>\[\](){}.*+?:!=|%\/\\^$,\'\";]


%s rules macro named_chunk
%x code start_condition options conditions action path set


// Off Topic
// ---------
//
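// Do NOT specify the xregexp option as we want the XRegExp \p{...} regex macros converted to
// native regexes and used as such:
//
// %options xregexp
//
// NOTE(review): `%options xregexp` IS specified in the options list further
// below, which contradicts the advice above -- confirm which one is intended.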

%options easy_keyword_rules
%options ranges
%options xregexp



%%

// A `%{` opens an explicit `%{...%}` action chunk: reset the curly-brace
// nesting counter (`yy.depth`, the same counter the `<action>` rules for
// `{`, `}`, line breaks and EOF inspect), enter the `action` state and push
// the `%{` back onto the input so the `<action>"%{"..."%}"` rule can consume
// the entire chunk in one go.
"%{"                                    yy.depth = 0;
                                        yy.include_command_allowed = false;
                                        this.pushState('action');
                                        this.unput(yytext);
                                        yytext = '';
                                        return 'ACTION_START';
<action>"%{"([^]*?)"%}"                 yytext = this.matches[1]; 
                                        yy.include_command_allowed = true;
                                        return 'ACTION';
<action>"%include"                      %{
                                            if (yy.include_command_allowed) {
                                                // This is an include instruction in place of an action:
                                                //
                                                // - one %include per action chunk
                                                // - one %include replaces an entire action chunk
                                                this.pushState('path');
                                                return 'INCLUDE';
                                            } else {
                                                // TODO
                                                yyerror('oops!');
                                                return 'INCLUDE_PLACEMENT_ERROR';
                                            }
                                        %}
<action>{WS}*"/*"[^]*?"*/"              //yy.include_command_allowed = false; -- doesn't impact include-allowed state 
                                        return 'ACTION_BODY_C_COMMENT';
<action>{WS}*"//".*                     yy.include_command_allowed = false; 
                                        return 'ACTION_BODY_CPP_COMMENT';
<action>{WS}+                           return 'ACTION_BODY_WHITESPACE';

// make sure to terminate on linefeed before the next rule alternative,
// which is announced by `|`:                                        
<action>"|"                             if (yy.include_command_allowed) {
                                            this.popState();
                                            this.unput(yytext);
                                            yytext = '';
                                            return 'ACTION_END';
                                        } else {
                                            return 'ACTION_BODY';    
                                        }

// make sure to terminate on linefeed before the rule section ends,
// which is announced by `%%`:                                        
<action>"%%"                            if (yy.include_command_allowed) {
                                            this.popState();
                                            this.unput(yytext);
                                            yytext = '';
                                            return 'ACTION_END';
                                        } else {
                                            return 'ACTION_BODY';    
                                        }

<action>"%"                             return 'ACTION_BODY';    

// regexp with braces or quotes (and no spaces, so we don't mistake
// a *division operator* `/` for a regex delimiter here in most circumstances):
<action>"/"[^\s/]*?(?:['"`{}][^\s/]*?)*"/"
                                        yy.include_command_allowed = false; 
                                        return 'ACTION_BODY';
// hack to cope with slashes which MAY be divide operators OR are regex starters:
// we simply gobble the entire line until the end or until we hit a closing brace,
// as we MUST keep track of the curly brace pairs inside an action body.
<action>"/"[^}{BR}]*
                                        yy.include_command_allowed = false; 
                                        return 'ACTION_BODY';
<action>\"{DOUBLEQUOTED_STRING_CONTENT}\"
                                        yy.include_command_allowed = false; 
                                        return 'ACTION_BODY';
<action>\'{QUOTED_STRING_CONTENT}\'     yy.include_command_allowed = false; 
                                        return 'ACTION_BODY';
<action>\`{ES2017_STRING_CONTENT}\`     yy.include_command_allowed = false; 
                                        return 'ACTION_BODY';
<action>[^{}/"'`|%\{\}{BR}{WS}]+        yy.include_command_allowed = false; 
                                        return 'ACTION_BODY';
<action>"{"                             yy.depth++; 
                                        yy.include_command_allowed = false; 
                                        return 'ACTION_BODY';
<action>"}"                             %{
                                            yy.include_command_allowed = false; 
                                            if (yy.depth <= 0) {
                                                yyerror(rmCommonWS`
                                                    too many closing curly braces in lexer rule action block.

                                                    Note: the action code chunk may be too complex for jison to parse
                                                    easily; we suggest you wrap the action code chunk in '%{...%\}'
                                                    to help jison grok more or less complex action code chunks.

                                                      Erroneous area:
                                                    ` + this.prettyPrintRange(this, yylloc));
                                                return 'BRACKETS_SURPLUS';
                                            } else {
                                                yy.depth--;
                                            }
                                            return 'ACTION_BODY';
                                        %}
// make sure to terminate on linefeed before the next rule alternative,
// which is announced by `|`.
// Note that lexer options & commands should be at the start-of-line, i.e.
// without leading whitespace. The only lexer command which we do accept
// here after the last indent is `%include`, which is considered (part
// of) the rule's action code block.
<action>(?:{BR}{WS}+)+/[^{WS}{BR}|]     yy.include_command_allowed = true; 
                                        return 'ACTION_BODY_WHITESPACE';           // keep empty lines as-is inside action code blocks.
<action>{BR}                            if (yy.depth > 0) {
                                            yy.include_command_allowed = true; 
                                            return 'ACTION_BODY_WHITESPACE';       // keep empty lines as-is inside action code blocks.
                                        } else {
                                            // end of action code chunk
                                            this.popState();
                                            this.unput(yytext);
                                            yytext = '';
                                            return 'ACTION_END';
                                        }
<action><<EOF>>                         %{
                                            yy.include_command_allowed = false; 
                                            if (yy.depth !== 0) {
                                                yyerror(rmCommonWS`
                                                    missing ${yy.depth} closing curly braces in lexer rule action block.

                                                    Note: the action code chunk may be too complex for jison to parse
                                                    easily; we suggest you wrap the action code chunk in '%{...%\}'
                                                    to help jison grok more or less complex action code chunks.

                                                      Erroneous area:
                                                    ` + this.prettyPrintRange(this, yylloc));
                                                yytext = '';
                                                return 'BRACKETS_MISSING';
                                            }
                                            this.popState();
                                            yytext = '';
                                            return 'ACTION_END';
                                        %}

<conditions>{NAME}                      return 'NAME';
<conditions>">"                         this.popState(); return '>';
<conditions>","                         return ',';
<conditions>"*"                         return '*';

// Comments should be gobbled and discarded anywhere
// *except* the code/action blocks:
<INITIAL,start_condition,macro,path,options>{WS}*"//"[^\r\n]*
                                        /* skip single-line comment */
<INITIAL,start_condition,macro,path,options>{WS}*"/*"[^]*?"*/"
                                        /* skip multi-line comment */

<rules>{BR}+                            /* empty */
<rules>{WS}+{BR}+                       /* empty */
<rules>"//"[^\r\n]*
                                        /* skip single-line comment */
<rules>"/*"[^]*?"*/"
                                        /* skip multi-line comment */
// ACTION code chunks follow rules and are generally indented, but
// never start with characters special to the lex language itself:
// - `%` can start options, commands, etc., e.g. `%include` or `%options`
// - `|` starts a rule alternative, never a chunk of action code.
// -  					
<rules>{WS}+/[^{WS}{BR}|%]              yy.depth = 0; 
                                        yy.include_command_allowed = true; 
                                        this.pushState('action'); 
                                        return 'ACTION_START';
<rules>"%%"                             this.popState(); 
                                        this.pushState('code'); 
                                        return '%%';
// Accept any non-regex-special character as a direct literal without
// the need to put quotes around it:
<rules>{ANY_LITERAL_CHAR}+
                                        %{
                                            // accept any non-regex, non-lex, non-string-delim,
                                            // non-escape-starter, non-space character as-is
                                            return 'CHARACTER_LIT';
                                        %}
<options>{NAME}                         return 'NAME';
<options>"="                            return '=';
<options>\"{DOUBLEQUOTED_STRING_CONTENT}\"
                                        yytext = unescQuote(this.matches[1], /\\"/g); return 'OPTION_STRING_VALUE';   // value is always a string type
<options>\'{QUOTED_STRING_CONTENT}\'
                                        yytext = unescQuote(this.matches[1], /\\'/g); return 'OPTION_STRING_VALUE';   // value is always a string type
<options>\`{ES2017_STRING_CONTENT}\`
                                        yytext = unescQuote(this.matches[1], /\\`/g); return 'OPTION_STRING_VALUE';   // value is always a string type

<options>[^\s\r\n]+                     return 'OPTION_VALUE';
<options>{BR}{WS}+(?=\S)                /* skip leading whitespace on the next line of input, when followed by more options */
<options>{BR}                           this.popState(); return 'OPTIONS_END';
<options>{WS}+                          /* skip whitespace */

<start_condition>{ID}                   return 'START_COND';
<start_condition>{BR}+                  this.popState();
<start_condition>{WS}+                  /* empty */

<named_chunk>{ID}                       return 'NAME';
<INITIAL>{ID}                           this.pushState('macro'); return 'NAME';
<macro,named_chunk>{BR}+                this.popState();

// Accept any non-regex-special character as a direct literal without
// the need to put quotes around it:
<macro>{ANY_LITERAL_CHAR}+              %{
                                            // accept any non-regex, non-lex, non-string-delim,
                                            // non-escape-starter, non-space character as-is
                                            return 'CHARACTER_LIT';
                                        %}

{BR}+                                   /* empty */
\s+                                     /* empty */

\"{DOUBLEQUOTED_STRING_CONTENT}\"       %{
                                            yytext = unescQuote(this.matches[1], /\\"/g);
                                            return 'STRING_LIT';
                                        %}
\'{QUOTED_STRING_CONTENT}\'             %{
                                            yytext = unescQuote(this.matches[1], /\\'/g);
                                            return 'STRING_LIT';
                                        %}
"["                                     this.pushState('set'); return 'REGEX_SET_START';
"|"                                     return '|';
"(?:"                                   return 'SPECIAL_GROUP';
"(?="                                   return 'SPECIAL_GROUP';
"(?!"                                   return 'SPECIAL_GROUP';
"("                                     return '(';
")"                                     return ')';
"+"                                     return '+';
"*"                                     return '*';
"?"                                     return '?';
"^"                                     return '^';
","                                     return ',';
"<<EOF>>"                               return '$';
"<"                                     this.pushState('conditions'); return '<';
"/!"                                    return '/!';                    // treated as `(?!atom)`
"/"                                     return '/';                     // treated as `(?=atom)`
"\\"([0-7]{1,3}|[rfntvsSbBwWdD\\*+()${}|[\]\/.^?]|"c"[A-Z]|"x"[0-9A-F]{2}|"u"[a-fA-F0-9]{4})
                                        return 'ESCAPE_CHAR';
"\\".                                   yytext = yytext.replace(/^\\/g, ''); return 'ESCAPE_CHAR';
"$"                                     return '$';
"."                                     return '.';
"%options"                              this.pushState('options'); return 'OPTIONS';
"%s"                                    this.pushState('start_condition'); return 'START_INC';
"%x"                                    this.pushState('start_condition'); return 'START_EXC';

"%code"                                 this.pushState('named_chunk'); return 'INIT_CODE';
"%import"                               this.pushState('named_chunk'); return 'IMPORT';

"%include"                              yy.depth = 0;
                                        yy.include_command_allowed = true; 
                                        this.pushState('action'); 
                                        this.unput(yytext);
                                        yytext = '';
                                        return 'ACTION_START';

<code>"%include"                        this.pushState('path'); 
                                        return 'INCLUDE';

<INITIAL,rules,code>"%"{NAME}([^\r\n]*)
                                        %{
                                            /* ignore unrecognized decl */
                                            this.warn(rmCommonWS`
                                                LEX: ignoring unsupported lexer option ${dquote(yytext)}
                                                while lexing in ${dquote(this.topState())} state.

                                                  Erroneous area:
                                                ` + this.prettyPrintRange(this, yylloc));
                                            yytext = [
                                                this.matches[1],            // {NAME}
                                                this.matches[2].trim()      // optional value/parameters
                                            ];
                                            return 'UNKNOWN_DECL';
                                        %}
"%%"                                    this.pushState('rules');
                                        return '%%';
"{"\d+(","\s*\d+|",")?"}"               return 'RANGE_REGEX';
"{"{ID}"}"                              return 'NAME_BRACE';
<set,options>"{"{ID}"}"                 return 'NAME_BRACE';
"{"                                     return '{';
"}"                                     return '}';


<set>(?:"\\\\"|"\\]"|[^\]{])+           return 'REGEX_SET';
<set>"{"                                return 'REGEX_SET';
<set>"]"                                this.popState();
                                        return 'REGEX_SET_END';


// in the trailing CODE block, only accept these `%include` macros when
// they appear at the start of a line and make sure the rest of lexer
// regexes account for this one so it'll match that way only:
<code>[^\r\n]*(\r|\n)+                  return 'CODE';
<code>[^\r\n]+                          return 'CODE';      // the bit of CODE just before EOF...


<path>{BR}                              this.popState(); this.unput(yytext);

<path>\"{DOUBLEQUOTED_STRING_CONTENT}\"
                                        yytext = unescQuote(this.matches[1]);
                                        this.popState();
                                        return 'PATH';
<path>\'{QUOTED_STRING_CONTENT}\'
                                        yytext = unescQuote(this.matches[1]);
                                        this.popState();
                                        return 'PATH';

<path>{WS}+                             // skip whitespace in the line
<path>[^\s\r\n]+                        this.popState();
                                        return 'PATH';


// detect and report unterminated string constants ASAP
// for 'action', 'options', but also for other lexer conditions:
//
// these error catching rules fix https://github.com/GerHobbelt/jison/issues/13
<action>\"                              yyerror(rmCommonWS`
                                            unterminated string constant in lexer rule action block.

                                              Erroneous area:
                                            ` + this.prettyPrintRange(this, yylloc));
                                        return 'error';
<action>\'                              yyerror(rmCommonWS`
                                            unterminated string constant in lexer rule action block.

                                              Erroneous area:
                                            ` + this.prettyPrintRange(this, yylloc));
                                        return 'error';
<action>\`                              yyerror(rmCommonWS`
                                            unterminated string constant in lexer rule action block.

                                              Erroneous area:
                                            ` + this.prettyPrintRange(this, yylloc));
                                        return 'error';

<options>\"                             yyerror(rmCommonWS`
                                            unterminated string constant in %options entry.

                                              Erroneous area:
                                            ` + this.prettyPrintRange(this, yylloc));
                                        return 'error';
<options>\'                             yyerror(rmCommonWS`
                                            unterminated string constant in %options entry.

                                              Erroneous area:
                                            ` + this.prettyPrintRange(this, yylloc));
                                        return 'error';
<options>\`                             yyerror(rmCommonWS`
                                            unterminated string constant in %options entry.

                                              Erroneous area:
                                            ` + this.prettyPrintRange(this, yylloc));
                                        return 'error';

<*>\"                                   var rules = (this.topState() === 'macro' ? 'macro\'s' : this.topState());
                                        yyerror(rmCommonWS`
                                            unterminated string constant  encountered while lexing
                                            ${rules}.

                                              Erroneous area:
                                            ` + this.prettyPrintRange(this, yylloc));
                                        return 'error';
<*>\'                                   var rules = (this.topState() === 'macro' ? 'macro\'s' : this.topState());
                                        yyerror(rmCommonWS`
                                            unterminated string constant  encountered while lexing
                                            ${rules}.

                                              Erroneous area:
                                            ` + this.prettyPrintRange(this, yylloc));
                                        return 'error';
<*>\`                                   var rules = (this.topState() === 'macro' ? 'macro\'s' : this.topState());
                                        yyerror(rmCommonWS`
                                            unterminated string constant  encountered while lexing
                                            ${rules}.

                                              Erroneous area:
                                            ` + this.prettyPrintRange(this, yylloc));
                                        return 'error';


<macro,rules>.                          %{
                                            /* b0rk on bad characters */
                                            var rules = (this.topState() === 'macro' ? 'macro\'s' : this.topState());
                                            yyerror(rmCommonWS`
                                                unsupported lexer input encountered while lexing
                                                ${rules} (i.e. jison lex regexes).

                                                    NOTE: When you want this input to be interpreted as a LITERAL part
                                                          of a lex rule regex, you MUST enclose it in double or
                                                          single quotes.

                                                          If not, then know that this input is not accepted as a valid
                                                          regex expression here in jison-lex ${rules}.

                                                  Erroneous area:
                                                ` + this.prettyPrintRange(this, yylloc));
                                        %}

<*>.                                    %{
                                            yyerror(rmCommonWS`
                                                unsupported lexer input: ${dquote(yytext)} 
						while lexing in ${dquote(this.topState())} state.

                                                  Erroneous area:
                                                ` + this.prettyPrintRange(this, yylloc));
                                        %}

<*><<EOF>>                              return 'EOF';

%%


// Local shorthands for the two `helpers` functions referenced by the lexer
// rule actions above.
var { rmCommonWS, dquote } = helpers;


/**
 * Prefix every line of a text with a fixed number of space characters.
 *
 * @param {string} s - the text to indent; lines are separated by '\n'.
 * @param {number} i - the number of spaces to put in front of each line.
 * @returns {string} the text with every line (including empty ones) indented.
 */
function indent(s, i) {
    var lines = s.split('\n');
    // joining (i + 1) empty array slots with ' ' yields exactly i spaces
    var padding = new Array(i + 1).join(' ');
    return lines.map(function (line) {
        return padding + line;
    }).join('\n');
}

/**
 * Unescape a string value which was wrapped in quotes/doublequotes/backquotes.
 *
 * Escaped backslashes (`\\`) are preserved as-is; in the text between them
 * every `\'` is turned into `'` and every `\"` into `"`.
 *
 * @param {*} str - the string content; it is coerced to a string first.
 * @param {RegExp} [quoteRe] - optional pattern matching one more escaped quote
 *   (a backslash followed by the quote character, e.g. `/\\`/g` for backquoted
 *   strings). Every match is replaced by itself minus its leading backslash.
 *   When omitted, the result is identical to the single-argument behaviour.
 * @returns {string} the unescaped string content.
 */
function unescQuote(str, quoteRe) {
    var extraRe = (quoteRe instanceof RegExp ? quoteRe : null);
    // split on escaped backslashes first so that the backslash of a `\\`
    // pair is never mistaken for the start of an escaped quote:
    var a = ('' + str).split('\\\\');
    a = a.map(function (s) {
        var r = s.replace(/\\'/g, "'").replace(/\\"/g, '"');
        if (extraRe) {
            // drop the backslash in front of the additional quote character
            r = r.replace(extraRe, function (m) {
                return m.slice(1);
            });
        }
        return r;
    });
    return a.join('\\\\');
}

license

MIT

related repositories