stub parser

2020-03-27 15:55:45 -07:00 · 2020-03-27 15:55:45 -07:00 · aa19d42a11
commit aa19d42a11
parent 7c75be543f
9 changed files with 205 additions and 39 deletions
--- a/package.json
+++ b/package.json
@ -17,7 +17,8 @@
  },
  "scripts": {
    "start": "react-scripts start",
-    "compile-grammar": "nearleyc src/utils/grammar.ne -o src/utils/grammar.js",
+    "compile-grammar": "nearleyc src/utils/latl/grammar.ne -o src/utils/latl/grammar.js",
+    "test-grammar": "nearley-test src/utils/latl/grammar.js --input",
    "flow": "flow",
    "build": "react-scripts build",
    "test": "react-scripts test",
--- a/src/utils/grammar.js
+++ b/src/utils/grammar.js
--- a/src/utils/latl/grammar.js
+++ b/src/utils/latl/grammar.js
@ -3,17 +3,50 @@
 (function () {
 function id(x) { return x[0]; }

-  const lexer = require('./lexer');
+  const { lexer } = require('./lexer.js');
+  const getTerminal = d => d ? d[0] : null;
+  const getAll = d => d.map((item, i) => ({[i]: item}));
+  const flag = token => d => d.map(item => ({[token]: item}))
+  const clearNull = d => d.filter(t => !!t);
+  const flagIndex = d => d.map((item, i) => ({[i]: item}))
+  const remove = _ => null;
+  const append = d => d.join('');
+  const constructSet =  d => d.reduce((acc, t) => { 
+    if (t && t.type === 'setIdentifier')  acc.push({set: t})
+    if (t && t.length)                         acc[acc.length - 1].phones = t;
+    return acc;
+  }, []);
+  const compose = (...funcs) => d => funcs.reduce((acc, func) => func(acc), d)
 var grammar = {
    Lexer: lexer,
    ParserRules: [
-    {"name": "main$ebnf$1$subexpression$1", "symbols": ["statement", {"literal":"\n"}]},
-    {"name": "main$ebnf$1", "symbols": ["main$ebnf$1$subexpression$1"]},
-    {"name": "main$ebnf$1$subexpression$2", "symbols": ["statement", {"literal":"\n"}]},
-    {"name": "main$ebnf$1", "symbols": ["main$ebnf$1", "main$ebnf$1$subexpression$2"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
-    {"name": "main", "symbols": ["main$ebnf$1"]},
-    {"name": "statement", "symbols": [{"literal":"foo"}]},
-    {"name": "statement", "symbols": [{"literal":"bar"}]}
+    {"name": "main$ebnf$1", "symbols": []},
+    {"name": "main$ebnf$1$subexpression$1", "symbols": ["statement"]},
+    {"name": "main$ebnf$1", "symbols": ["main$ebnf$1", "main$ebnf$1$subexpression$1"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
+    {"name": "main", "symbols": ["main$ebnf$1"], "postprocess": compose(flag('main'), getTerminal)},
+    {"name": "_$ebnf$1$subexpression$1", "symbols": [(lexer.has("whiteSpace") ? {type: "whiteSpace"} : whiteSpace)]},
+    {"name": "_$ebnf$1", "symbols": ["_$ebnf$1$subexpression$1"], "postprocess": id},
+    {"name": "_$ebnf$1", "symbols": [], "postprocess": function(d) {return null;}},
+    {"name": "_", "symbols": ["_$ebnf$1"], "postprocess": remove},
+    {"name": "__", "symbols": [(lexer.has("whiteSpace") ? {type: "whiteSpace"} : whiteSpace)], "postprocess": remove},
+    {"name": "statement", "symbols": ["comment"]},
+    {"name": "statement", "symbols": ["definition"], "postprocess": compose(clearNull, getTerminal)},
+    {"name": "comment", "symbols": [(lexer.has("comment") ? {type: "comment"} : comment)], "postprocess": compose(remove, getTerminal)},
+    {"name": "definition", "symbols": [(lexer.has("kwSet") ? {type: "kwSet"} : kwSet), "__", "setDefinition"], "postprocess": d => ({token: 'setDefinition', sets: d[2]})},
+    {"name": "setDefinition$ebnf$1", "symbols": []},
+    {"name": "setDefinition$ebnf$1$subexpression$1", "symbols": [(lexer.has("setIdentifier") ? {type: "setIdentifier"} : setIdentifier), "__", (lexer.has("equal") ? {type: "equal"} : equal), "__", "setExpression", (lexer.has("comma") ? {type: "comma"} : comma), "__"]},
+    {"name": "setDefinition$ebnf$1", "symbols": ["setDefinition$ebnf$1", "setDefinition$ebnf$1$subexpression$1"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
+    {"name": "setDefinition", "symbols": ["setDefinition$ebnf$1", (lexer.has("setIdentifier") ? {type: "setIdentifier"} : setIdentifier), "__", (lexer.has("equal") ? {type: "equal"} : equal), "__", "setExpression"], "postprocess": constructSet},
+    {"name": "setExpression", "symbols": [(lexer.has("openSquareBracket") ? {type: "openSquareBracket"} : openSquareBracket), "_", "phoneList", "_", (lexer.has("closeSquareBracket") ? {type: "closeSquareBracket"} : closeSquareBracket)], "postprocess": d => d.filter(t => t && t.length)},
+    {"name": "phoneList$ebnf$1", "symbols": []},
+    {"name": "phoneList$ebnf$1$subexpression$1", "symbols": [(lexer.has("phone") ? {type: "phone"} : phone), (lexer.has("comma") ? {type: "comma"} : comma), "_"]},
+    {"name": "phoneList$ebnf$1", "symbols": ["phoneList$ebnf$1", "phoneList$ebnf$1$subexpression$1"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
+    {"name": "phoneList", "symbols": ["phoneList$ebnf$1", (lexer.has("phone") ? {type: "phone"} : phone)], "postprocess":  d => d.filter(t => t && (t.type === 'phone' || t.length) )
+        .map(t => {
+          if (!t.length) return t;
+          t.filter(st => st && st.type === 'phone')
+          return t;
+        }) }
 ]
  , ParserStart: "main"
 }
--- a/src/utils/latl/grammar.ne
+++ b/src/utils/latl/grammar.ne
@ -1,8 +1,79 @@
@{%
-  const lexer = require('./lexer');
+  const { lexer } = require('./lexer.js');
+  const getTerminal = d => d ? d[0] : null;
+  const getAll = d => d.map((item, i) => ({[i]: item}));
+  const flag = token => d => d.map(item => ({[token]: item}))
+  const clearNull = d => d.filter(t => !!t);
+  const flagIndex = d => d.map((item, i) => ({[i]: item}))
+  const remove = _ => null;
+  const append = d => d.join('');
+  const constructSet =  d => d.reduce((acc, t) => { 
+    if (t && t.type === 'setIdentifier')  acc.push({set: t})
+    if (t && t.length)                         acc[acc.length - 1].phones = t;
+    return acc;
+  }, []);
+  const compose = (...funcs) => d => funcs.reduce((acc, func) => func(acc), d)
 %}

@lexer lexer

-main        -> (statement "\n"):+
-statement   -> "foo" | "bar"
+main            -> (statement):* 
+  {% compose(flag('main'), getTerminal) %}
+
+_               -> (%whiteSpace):? 
+  {% remove %}
+
+__              -> %whiteSpace 
+  {% remove %}
+
+statement       -> comment | definition 
+  {% compose(clearNull, getTerminal) %}
+
+comment         -> %comment 
+  {% compose(remove, getTerminal) %}
+
+# SETS
+definition      -> %kwSet __ setDefinition {% d => ({token: 'setDefinition', sets: d[2]}) %}
+setDefinition   -> (%setIdentifier __ %equal __ setExpression %comma __):* %setIdentifier __ %equal __ setExpression
+  {% constructSet %}
+setExpression   -> %openSquareBracket _ phoneList _ %closeSquareBracket
+  {% d => d.filter(t => t && t.length) %}
+phoneList       -> (%phone %comma _):* %phone
+  {% d => d.filter(t => t && (t.type === 'phone' || t.length) )
+  .map(t => {
+    if (!t.length) return t;
+    t.filter(st => st && st.type === 'phone')
+    return t;
+  }) %}
+
+
+# assignmentExpression:
+# 	/*
+# 	 * SPEC:
+# 	 * conditionalExpression
+# 	 * | leftHandSideExpression assignmentOperator assignmentExpression
+# 	 */
+# 	(leftHandSideExpression assignmentOperator) =>
+# 	leftHandSideExpression assignmentOperator assignmentExpression
+# 	| conditionalExpression
+# 	;
+
+# assignmentExpressionNoln:
+# 	conditionalExpressionNoln
+# 	| leftHandSideExpression assignmentOperator assignmentExpressionNoln
+# 	;
+
+# assignmentOperator:
+# 	/* note that in the grammar these are listed out explicitly */
+# 	EQ | TIMESEQ | DIVIDEEQ | PERCENTEQ | PLUSEQ | MINUSEQ | LSHIFTEQ | RSHIFTEQ
+# 	| GT3EQ | AMPEREQ | CAROTEQ | PIPEEQ
+# 	;
+
+# expression:
+# 	/* 
+# 	 * SPEC:
+# 	 * assignmentExpression
+# 	 * | expression COMMA assignmentExpression
+# 	 */
+# 	assignmentExpression (expressionTail)*
+# 	;
--- a/src/utils/latl/lexer.js
+++ b/src/utils/latl/lexer.js
@ -1,17 +1,13 @@
 const moo = require('moo');

-export const lexer = moo.states({
+const lexer = moo.states({
  main: {
    comment:              /;.*$/,
    star:          { match: /\*/, push: 'epoch' },
    slash:                { match: /\//, push: 'lexicon' },
    // change so that identifiers are always upper, keywords are always lower, phones are always lower
-    'kw-set':             { match: 'set', type: moo.keywords({ 'kw-set': 'set '}), push: 'setDefinition'},
+    'kwSet':             { match: 'set', type: moo.keywords({ 'kwSet': 'set '}), push: 'setDefinition'},
    identifier:           { match: /[A-Za-z]+[\u00c0-\u03FFA-Za-z0-9\\-\\_]*/, },
-    // type: moo.keywords({
-    //   'kw-set': 'set'
-    //   // { match: 'set', push: 'setDefinition' },
-    // })},
    openBracket:          { match: /\[/, push: 'feature' },
    whiteSpace:           { match: /\s+/, lineBreaks: true },
    newLine:              { match: /\n+/, lineBreaks: true }
@ -117,3 +113,5 @@ export const lexer = moo.states({
  }
  
 });
+
+module.exports = {lexer};
--- a/src/utils/latl/parser.js
+++ b/src/utils/latl/parser.js
@ -1,4 +1,4 @@
 const nearley = require("nearley");
 const grammar = require("./grammar.js");

-const parser = new nearley.Parser(nearley.Grammar.fromCompiled(grammar));
+export const parser = () => new nearley.Parser(nearley.Grammar.fromCompiled(grammar));
--- a/src/utils/latl/test/assertionData.js
+++ b/src/utils/latl/test/assertionData.js
@ -1,11 +1,25 @@
 export const assertionData = {
-  setDefinition: {
+  simpleComment: {
+    latl: `; comment`,
+    tokens: [
+      { type: 'comment', value: '; comment'}
+    ]
+  },
+  simpleSetDefinition: {
+    latl: `set PLOSIVES`,
+    tokens: [
+      { type: 'kwSet', value: 'set' }, 
+      { type: 'whiteSpace', value: ' ' }, 
+      { type: 'setIdentifier', value: 'PLOSIVES' }
+    ]
+  },
+  commaSetDefinition: {
    latl: `
 set NASAL_PULMONIC_CONSONANTS               = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ],
    STOP_PULMONIC_CONSONANTS                = [ p, b, p̪, b̪, t̼, d̼, t, d, ʈ, ɖ, c, ɟ, k, ɡ, q, ɢ, ʡ, ʔ ]`,
    tokens: [
      { type: 'whiteSpace', value: '\n' },
-      { type: 'kw-set', value: 'set' },
+      { type: 'kwSet', value: 'set' },
      { type: 'whiteSpace', value: ' ' },
      { type: 'setIdentifier', value: 'NASAL_PULMONIC_CONSONANTS' },
      { type: 'whiteSpace', value: '               ' },
@ -121,7 +135,7 @@ set NASAL_PULMONIC_CONSONANTS               = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊,
 set NASAL_PULMONIC_CONSONANTS, N            = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ]`,
    tokens: [  
      { type: 'whiteSpace', value: '\n' },
-      { type: 'kw-set', value: 'set' },
+      { type: 'kwSet', value: 'set' },
      { type: 'whiteSpace', value: ' ' },
      { type: 'setIdentifier', value: 'NASAL_PULMONIC_CONSONANTS' },
      { type: 'comma', value: ',' },
@ -180,7 +194,7 @@ set CLICK_CONSONANTS  = { TENUIS_CLICK_CONSONANTS or VOICED_CLICK_CONSONANTS
                        }`,
    tokens: [
      { type: 'whiteSpace', value: '\n'}, 
-      { type: 'kw-set', value: 'set'}, 
+      { type: 'kwSet', value: 'set'}, 
      { type: 'whiteSpace', value: ' '}, 
      { type: 'setIdentifier', value: 'CLICK_CONSONANTS'}, 
      { type: 'whiteSpace', value: '  '}, 
@ -215,7 +229,7 @@ set NASAL_VOWELS                      = { [ V ] in ORAL_VOWELS yield [ Ṽ ] },
    LONG_NASAL_VOWELS                 = { [ Vː ] in NASAL_VOWELS }`,
    tokens: [ 
      { type: 'whiteSpace', value: '\n',  }, 
-      { type: 'kw-set', value: 'set',  }, 
+      { type: 'kwSet', value: 'set',  }, 
      { type: 'whiteSpace', value: ' ',  }, 
      { type: 'setIdentifier', value: 'NASAL_VOWELS',  }, 
      { type: 'whiteSpace', value: '                      ',  }, 
@ -307,7 +321,7 @@ set SET_C = { SET_A not SET_B }, ;  left anti join
      { type: 'whiteSpace', value: '\n',  },
      { type: 'comment', value: '; ---- set join operations non-mutable! ',  },
      { type: 'whiteSpace', value: '\n',  },
-      { type: 'kw-set', value: 'set',  },
+      { type: 'kwSet', value: 'set',  },
      { type: 'whiteSpace', value: ' ',  },
      { type: 'setIdentifier', value: 'SET_C',  },
      { type: 'whiteSpace', value: ' ',  },
@ -406,7 +420,7 @@ set SET_B = { [ Xy ] in SET_A },    ; FILTER:       where X is any character and
      { type: 'whiteSpace', value: '\n',  },
      { type: 'comment', value: '; ---- set character operations - non-mutable!',  },
      { type: 'whiteSpace', value: '\n',  },
-      { type: 'kw-set', value: 'set',  },
+      { type: 'kwSet', value: 'set',  },
      { type: 'whiteSpace', value: ' ',  },
      { type: 'setIdentifier', value: 'SET_B',  },
      { type: 'whiteSpace', value: ' ',  },
--- a/src/utils/latl/test/lexer.test.js
+++ b/src/utils/latl/test/lexer.test.js
@ -16,25 +16,25 @@ describe('lexer', () => {
  }

  it('lexes simple comment', () => {
-    lexer.reset('; comment');
-    const token = lexer.next();
-    expect(getToken(token)).toStrictEqual({ type: 'comment', value: '; comment'});
+    const { latl, tokens } = assertionData.simpleComment;
+    const stream           = getStream(latl);
+    expect(stream).toStrictEqual(tokens);
  });

-  it('lexes simple * and identifier', () => {
-    lexer.reset('*proto');
-    const stream = [ getToken(lexer.next()), getToken(lexer.next()) ];
-    expect(stream).toStrictEqual([ { type: 'star', value: '*' }, { type: 'identifier', value: 'proto' } ]);
-  })
+  // it('lexes simple * and identifier', () => {
+  //   lexer.reset('*proto');
+  //   const stream = [ getToken(lexer.next()), getToken(lexer.next()) ];
+  //   expect(stream).toStrictEqual([ { type: 'star', value: '*' }, { type: 'identifier', value: 'proto' } ]);
+  // })

  it('lexes set and identifier', () => {
-    lexer.reset('set PLOSIVES');
-    const stream = [ getToken(lexer.next()), getToken(lexer.next()), getToken(lexer.next()) ];
-    expect(stream).toStrictEqual([ { type: 'kw-set', value: 'set' }, { type: 'whiteSpace', value: ' ' }, { type: 'setIdentifier', value: 'PLOSIVES' } ]);
+    const { latl, tokens } = assertionData.simpleSetDefinition;
+    const stream           = getStream(latl);
+    expect(stream).toStrictEqual(tokens);
  })

  it('lexes multiple set definitions with comma operator', () => {
-    const { latl, tokens } = assertionData.setDefinition;
+    const { latl, tokens } = assertionData.commaSetDefinition;
    const stream           = getStream(latl);
    expect(stream).toStrictEqual(tokens);
  });
--- a/src/utils/latl/test/parser.test.js
+++ b/src/utils/latl/test/parser.test.js
@ -0,0 +1,49 @@
+import { lexer } from '../lexer';
+import { parser } from '../parser';
+import { assertionData } from './assertionData';
+
+describe('parser', () => {
+  it('parses simple comment', () => {
+    const { latl } = assertionData.simpleComment;
+    const AST = parser().feed(latl).results;
+    expect(AST.length).toBe(1);
+    console.log(AST[0])
+    // expect(AST[0]).toStrictEqual()
+  })
+
+  // it('parses multiple set definitions with comma operator', () => {
+  //   const { latl } = assertionData.commaSetDefinition;
+  //   const AST = parser().feed(latl)
+  //   console.log(AST)
+  // });
+
+  // it('lexes set definition with alias', () => {
+  //   const { latl, tokens } = assertionData.setAliasDefinition;
+  //   const stream           = getStream(latl);
+  //   expect(stream).toStrictEqual(tokens);
+  // });
+
+  // it('lexes set definition with set join', () => {
+  //   const { latl, tokens } = assertionData.setDefinitionJoin;
+  //   const stream           = getStream(latl);
+  //   expect(stream).toStrictEqual(tokens);
+  // });
+
+  // it('lexes set definition with yield operation', () => {
+  //   const { latl, tokens } = assertionData.setDefinitionYield;
+  //   const stream           = getStream(latl);
+  //   expect(stream).toStrictEqual(tokens);
+  // });
+
+  // it('lexes all set join operations', () => {
+  //   const { latl, tokens } = assertionData.setOperationsJoin;
+  //   const stream           = getStream(latl);
+  //   expect(stream).toStrictEqual(tokens);
+  // });
+
+  // it('lexes set filter, concat, and dissoc operations', () => {
+  //   const { latl, tokens } = assertionData.setOperations;
+  //   const stream           = getStream(latl);
+  //   expect(stream).toStrictEqual(tokens);
+  // })
+})