From aa19d42a11e34d9932e49320eaad4d3b450b3209 Mon Sep 17 00:00:00 2001 From: Sorrel Bri Date: Fri, 27 Mar 2020 15:55:45 -0700 Subject: [PATCH] stub parser --- package.json | 3 +- src/utils/grammar.js | 0 src/utils/latl/grammar.js | 49 +++++++++++++++--- src/utils/latl/grammar.ne | 77 ++++++++++++++++++++++++++-- src/utils/latl/lexer.js | 12 ++--- src/utils/latl/parser.js | 2 +- src/utils/latl/test/assertionData.js | 28 +++++++--- src/utils/latl/test/lexer.test.js | 24 ++++----- src/utils/latl/test/parser.test.js | 49 ++++++++++++++++++ 9 files changed, 205 insertions(+), 39 deletions(-) create mode 100644 src/utils/grammar.js create mode 100644 src/utils/latl/test/parser.test.js diff --git a/package.json b/package.json index 48fcadd..c71741a 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,8 @@ }, "scripts": { "start": "react-scripts start", - "compile-grammar": "nearleyc src/utils/grammar.ne -o src/utils/grammar.js", + "compile-grammar": "nearleyc src/utils/latl/grammar.ne -o src/utils/latl/grammar.js", + "test-grammar": "nearley-test src/utils/latl/grammar.js --input", "flow": "flow", "build": "react-scripts build", "test": "react-scripts test", diff --git a/src/utils/grammar.js b/src/utils/grammar.js new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/latl/grammar.js b/src/utils/latl/grammar.js index c3f4012..9e51a0c 100644 --- a/src/utils/latl/grammar.js +++ b/src/utils/latl/grammar.js @@ -3,17 +3,50 @@ (function () { function id(x) { return x[0]; } - const lexer = require('./lexer'); + const { lexer } = require('./lexer.js'); + const getTerminal = d => d ? d[0] : null; + const getAll = d => d.map((item, i) => ({[i]: item})); + const flag = token => d => d.map(item => ({[token]: item})) + const clearNull = d => d.filter(t => !!t); + const flagIndex = d => d.map((item, i) => ({[i]: item})) + const remove = _ => null; + const append = d => d.join(''); + const constructSet = d => d.reduce((acc, t) => { + if (t && t.type === 'setIdentifier') acc.push({set: t}) + if (t && t.length) acc[acc.length - 1].phones = t; + return acc; + }, []); + const compose = (...funcs) => d => funcs.reduce((acc, func) => func(acc), d) var grammar = { Lexer: lexer, ParserRules: [ - {"name": "main$ebnf$1$subexpression$1", "symbols": ["statement", {"literal":"\n"}]}, - {"name": "main$ebnf$1", "symbols": ["main$ebnf$1$subexpression$1"]}, - {"name": "main$ebnf$1$subexpression$2", "symbols": ["statement", {"literal":"\n"}]}, - {"name": "main$ebnf$1", "symbols": ["main$ebnf$1", "main$ebnf$1$subexpression$2"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}}, - {"name": "main", "symbols": ["main$ebnf$1"]}, - {"name": "statement", "symbols": [{"literal":"foo"}]}, - {"name": "statement", "symbols": [{"literal":"bar"}]} + {"name": "main$ebnf$1", "symbols": []}, + {"name": "main$ebnf$1$subexpression$1", "symbols": ["statement"]}, + {"name": "main$ebnf$1", "symbols": ["main$ebnf$1", "main$ebnf$1$subexpression$1"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}}, + {"name": "main", "symbols": ["main$ebnf$1"], "postprocess": compose(flag('main'), getTerminal)}, + {"name": "_$ebnf$1$subexpression$1", "symbols": [(lexer.has("whiteSpace") ? {type: "whiteSpace"} : whiteSpace)]}, + {"name": "_$ebnf$1", "symbols": ["_$ebnf$1$subexpression$1"], "postprocess": id}, + {"name": "_$ebnf$1", "symbols": [], "postprocess": function(d) {return null;}}, + {"name": "_", "symbols": ["_$ebnf$1"], "postprocess": remove}, + {"name": "__", "symbols": [(lexer.has("whiteSpace") ? 
{type: "whiteSpace"} : whiteSpace)], "postprocess": remove}, + {"name": "statement", "symbols": ["comment"]}, + {"name": "statement", "symbols": ["definition"], "postprocess": compose(clearNull, getTerminal)}, + {"name": "comment", "symbols": [(lexer.has("comment") ? {type: "comment"} : comment)], "postprocess": compose(remove, getTerminal)}, + {"name": "definition", "symbols": [(lexer.has("kwSet") ? {type: "kwSet"} : kwSet), "__", "setDefinition"], "postprocess": d => ({token: 'setDefinition', sets: d[2]})}, + {"name": "setDefinition$ebnf$1", "symbols": []}, + {"name": "setDefinition$ebnf$1$subexpression$1", "symbols": [(lexer.has("setIdentifier") ? {type: "setIdentifier"} : setIdentifier), "__", (lexer.has("equal") ? {type: "equal"} : equal), "__", "setExpression", (lexer.has("comma") ? {type: "comma"} : comma), "__"]}, + {"name": "setDefinition$ebnf$1", "symbols": ["setDefinition$ebnf$1", "setDefinition$ebnf$1$subexpression$1"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}}, + {"name": "setDefinition", "symbols": ["setDefinition$ebnf$1", (lexer.has("setIdentifier") ? {type: "setIdentifier"} : setIdentifier), "__", (lexer.has("equal") ? {type: "equal"} : equal), "__", "setExpression"], "postprocess": constructSet}, + {"name": "setExpression", "symbols": [(lexer.has("openSquareBracket") ? {type: "openSquareBracket"} : openSquareBracket), "_", "phoneList", "_", (lexer.has("closeSquareBracket") ? {type: "closeSquareBracket"} : closeSquareBracket)], "postprocess": d => d.filter(t => t && t.length)}, + {"name": "phoneList$ebnf$1", "symbols": []}, + {"name": "phoneList$ebnf$1$subexpression$1", "symbols": [(lexer.has("phone") ? {type: "phone"} : phone), (lexer.has("comma") ? {type: "comma"} : comma), "_"]}, + {"name": "phoneList$ebnf$1", "symbols": ["phoneList$ebnf$1", "phoneList$ebnf$1$subexpression$1"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}}, + {"name": "phoneList", "symbols": ["phoneList$ebnf$1", (lexer.has("phone") ? {type: "phone"} : phone)], "postprocess": d => d.filter(t => t && (t.type === 'phone' || t.length) ) + .map(t => { + if (!t.length) return t; + t.filter(st => st && st.type === 'phone') + return t; + }) } ] , ParserStart: "main" } diff --git a/src/utils/latl/grammar.ne b/src/utils/latl/grammar.ne index 16fbee1..fff5adb 100644 --- a/src/utils/latl/grammar.ne +++ b/src/utils/latl/grammar.ne @@ -1,8 +1,79 @@ @{% - const lexer = require('./lexer'); + const { lexer } = require('./lexer.js'); + const getTerminal = d => d ? d[0] : null; + const getAll = d => d.map((item, i) => ({[i]: item})); + const flag = token => d => d.map(item => ({[token]: item})) + const clearNull = d => d.filter(t => !!t); + const flagIndex = d => d.map((item, i) => ({[i]: item})) + const remove = _ => null; + const append = d => d.join(''); + const constructSet = d => d.reduce((acc, t) => { + if (t && t.type === 'setIdentifier') acc.push({set: t}) + if (t && t.length) acc[acc.length - 1].phones = t; + return acc; + }, []); + const compose = (...funcs) => d => funcs.reduce((acc, func) => func(acc), d) %} @lexer lexer -main -> (statement "\n"):+ -statement -> "foo" | "bar" \ No newline at end of file +main -> (statement):* + {% compose(flag('main'), getTerminal) %} + +_ -> (%whiteSpace):? 
+  {% remove %}
+
+__ -> %whiteSpace
+  {% remove %}
+
+statement -> comment | definition
+  {% compose(clearNull, getTerminal) %}
+
+comment -> %comment
+  {% compose(remove, getTerminal) %}
+
+# SETS
+definition -> %kwSet __ setDefinition {% d => ({token: 'setDefinition', sets: d[2]}) %}
+setDefinition -> (%setIdentifier __ %equal __ setExpression %comma __):* %setIdentifier __ %equal __ setExpression
+  {% constructSet %}
+setExpression -> %openSquareBracket _ phoneList _ %closeSquareBracket
+  {% d => d.filter(t => t && t.length) %}
+phoneList -> (%phone %comma _):* %phone
+  {% d => d.filter(t => t && (t.type === 'phone' || t.length) )
+    .map(t => {
+      if (!t.length) return t;
+      // keep only phone tokens, dropping commas and whitespace nulls
+      return t.filter(st => st && st.type === 'phone');
+    }) %}
+
+
+# assignmentExpression:
+#   /*
+#    * SPEC:
+#    * conditionalExpression
+#    * | leftHandSideExpression assignmentOperator assignmentExpression
+#    */
+#   (leftHandSideExpression assignmentOperator) =>
+#     leftHandSideExpression assignmentOperator assignmentExpression
+#     | conditionalExpression
+#   ;
+
+# assignmentExpressionNoln:
+#   conditionalExpressionNoln
+#   | leftHandSideExpression assignmentOperator assignmentExpressionNoln
+#   ;
+
+# assignmentOperator:
+#   /* note that in the grammar these are listed out explicitly */
+#   EQ | TIMESEQ | DIVIDEEQ | PERCENTEQ | PLUSEQ | MINUSEQ | LSHIFTEQ | RSHIFTEQ
+#   | GT3EQ | AMPEREQ | CAROTEQ | PIPEEQ
+#   ;
+
+# expression:
+#   /*
+#    * SPEC:
+#    * assignmentExpression
+#    * | expression COMMA assignmentExpression
+#    */
+#   assignmentExpression (expressionTail)*
+#   ;
\ No newline at end of file
diff --git a/src/utils/latl/lexer.js b/src/utils/latl/lexer.js
index 995aac7..bae2c3f 100644
--- a/src/utils/latl/lexer.js
+++ b/src/utils/latl/lexer.js
@@ -1,17 +1,13 @@
 const moo = require('moo');

-export const lexer = moo.states({
+const lexer = moo.states({
   main: {
     comment: /;.*$/,
     star: { match: /\*/, push: 'epoch' },
     slash: { match: /\//, push: 'lexicon' },
     // change so that identifiers are always upper, keywords are always lower, phones are always lower
-    'kw-set': { match: 'set', type: moo.keywords({ 'kw-set': 'set '}), push: 'setDefinition'},
+    'kwSet': { match: 'set', type: moo.keywords({ 'kwSet': 'set' }), push: 'setDefinition'},
     identifier: { match: /[A-Za-z]+[\u00c0-\u03FFA-Za-z0-9\\-\\_]*/, },
-    // type: moo.keywords({
-    //   'kw-set': 'set'
-    //   // { match: 'set', push: 'setDefinition' },
-    //   })},
     openBracket: { match: /\[/, push: 'feature' },
     whiteSpace: { match: /\s+/, lineBreaks: true },
     newLine: { match: /\n+/, lineBreaks: true }
@@ -116,4 +112,6 @@
     newLine: { match: /\n/, lineBreaks: true, pop: true }
   }

-});
\ No newline at end of file
+});
+
+module.exports = {lexer};
\ No newline at end of file
diff --git a/src/utils/latl/parser.js b/src/utils/latl/parser.js
index 62d56fa..a7f5143 100644
--- a/src/utils/latl/parser.js
+++ b/src/utils/latl/parser.js
@@ -1,4 +1,4 @@
 const nearley = require("nearley");
 const grammar = require("./grammar.js");

-const parser = new nearley.Parser(nearley.Grammar.fromCompiled(grammar));
\ No newline at end of file
+export const parser = () => new nearley.Parser(nearley.Grammar.fromCompiled(grammar));
\ No newline at end of file
diff --git a/src/utils/latl/test/assertionData.js b/src/utils/latl/test/assertionData.js
index c224ef6..8b927e6 100644
--- a/src/utils/latl/test/assertionData.js
+++ b/src/utils/latl/test/assertionData.js
@@ -1,11 +1,25 @@
 export const assertionData = {
-  setDefinition: {
+  simpleComment: {
+    latl: `; comment`,
+    tokens: [
+      { type: 
'comment', value: '; comment'} + ] + }, + simpleSetDefinition: { + latl: `set PLOSIVES`, + tokens: [ + { type: 'kwSet', value: 'set' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'setIdentifier', value: 'PLOSIVES' } + ] + }, + commaSetDefinition: { latl: ` set NASAL_PULMONIC_CONSONANTS = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ], STOP_PULMONIC_CONSONANTS = [ p, b, p̪, b̪, t̼, d̼, t, d, ʈ, ɖ, c, ɟ, k, ɡ, q, ɢ, ʡ, ʔ ]`, tokens: [ { type: 'whiteSpace', value: '\n' }, - { type: 'kw-set', value: 'set' }, + { type: 'kwSet', value: 'set' }, { type: 'whiteSpace', value: ' ' }, { type: 'setIdentifier', value: 'NASAL_PULMONIC_CONSONANTS' }, { type: 'whiteSpace', value: ' ' }, @@ -121,7 +135,7 @@ set NASAL_PULMONIC_CONSONANTS = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, set NASAL_PULMONIC_CONSONANTS, N = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ]`, tokens: [ { type: 'whiteSpace', value: '\n' }, - { type: 'kw-set', value: 'set' }, + { type: 'kwSet', value: 'set' }, { type: 'whiteSpace', value: ' ' }, { type: 'setIdentifier', value: 'NASAL_PULMONIC_CONSONANTS' }, { type: 'comma', value: ',' }, @@ -180,7 +194,7 @@ set CLICK_CONSONANTS = { TENUIS_CLICK_CONSONANTS or VOICED_CLICK_CONSONANTS }`, tokens: [ { type: 'whiteSpace', value: '\n'}, - { type: 'kw-set', value: 'set'}, + { type: 'kwSet', value: 'set'}, { type: 'whiteSpace', value: ' '}, { type: 'setIdentifier', value: 'CLICK_CONSONANTS'}, { type: 'whiteSpace', value: ' '}, @@ -215,7 +229,7 @@ set NASAL_VOWELS = { [ V ] in ORAL_VOWELS yield [ Ṽ ] }, LONG_NASAL_VOWELS = { [ Vː ] in NASAL_VOWELS }`, tokens: [ { type: 'whiteSpace', value: '\n', }, - { type: 'kw-set', value: 'set', }, + { type: 'kwSet', value: 'set', }, { type: 'whiteSpace', value: ' ', }, { type: 'setIdentifier', value: 'NASAL_VOWELS', }, { type: 'whiteSpace', value: ' ', }, @@ -307,7 +321,7 @@ set SET_C = { SET_A not SET_B }, ; left anti join { type: 'whiteSpace', value: '\n', }, { type: 'comment', value: '; ---- set join operations non-mutable! 
', },
       { type: 'whiteSpace', value: '\n', },
-      { type: 'kw-set', value: 'set', },
+      { type: 'kwSet', value: 'set', },
       { type: 'whiteSpace', value: ' ', },
       { type: 'setIdentifier', value: 'SET_C', },
       { type: 'whiteSpace', value: ' ', },
@@ -406,7 +420,7 @@ set SET_B = { [ Xy ] in SET_A }, ; FILTER: where X is any character and
       { type: 'whiteSpace', value: '\n', },
       { type: 'comment', value: '; ---- set character operations - non-mutable!', },
       { type: 'whiteSpace', value: '\n', },
-      { type: 'kw-set', value: 'set', },
+      { type: 'kwSet', value: 'set', },
       { type: 'whiteSpace', value: ' ', },
       { type: 'setIdentifier', value: 'SET_B', },
       { type: 'whiteSpace', value: ' ', },
diff --git a/src/utils/latl/test/lexer.test.js b/src/utils/latl/test/lexer.test.js
index d0cb9ea..aa96096 100644
--- a/src/utils/latl/test/lexer.test.js
+++ b/src/utils/latl/test/lexer.test.js
@@ -16,25 +16,25 @@ describe('lexer', () => {
   }

   it('lexes simple comment', () => {
-    lexer.reset('; comment');
-    const token = lexer.next();
-    expect(getToken(token)).toStrictEqual({ type: 'comment', value: '; comment'});
+    const { latl, tokens } = assertionData.simpleComment;
+    const stream = getStream(latl);
+    expect(stream).toStrictEqual(tokens);
   });

-  it('lexes simple * and identifier', () => {
-    lexer.reset('*proto');
-    const stream = [ getToken(lexer.next()), getToken(lexer.next()) ];
-    expect(stream).toStrictEqual([ { type: 'star', value: '*' }, { type: 'identifier', value: 'proto' } ]);
-  })
+  // it('lexes simple * and identifier', () => {
+  //   lexer.reset('*proto');
+  //   const stream = [ getToken(lexer.next()), getToken(lexer.next()) ];
+  //   expect(stream).toStrictEqual([ { type: 'star', value: '*' }, { type: 'identifier', value: 'proto' } ]);
+  // })

   it('lexes set and identifier', () => {
-    lexer.reset('set PLOSIVES');
-    const stream = [ getToken(lexer.next()), getToken(lexer.next()), getToken(lexer.next()) ];
-    expect(stream).toStrictEqual([ { type: 'kw-set', value: 'set' }, { type: 'whiteSpace', value: ' ' }, { type: 'setIdentifier', value: 'PLOSIVES' } ]);
+    const { latl, tokens } = assertionData.simpleSetDefinition;
+    const stream = getStream(latl);
+    expect(stream).toStrictEqual(tokens);
   })

   it('lexes multiple set definitions with comma operator', () => {
-    const { latl, tokens } = assertionData.setDefinition;
+    const { latl, tokens } = assertionData.commaSetDefinition;
     const stream = getStream(latl);
     expect(stream).toStrictEqual(tokens);
   });
diff --git a/src/utils/latl/test/parser.test.js b/src/utils/latl/test/parser.test.js
new file mode 100644
index 0000000..2b36d59
--- /dev/null
+++ b/src/utils/latl/test/parser.test.js
@@ -0,0 +1,49 @@
+import { lexer } from '../lexer';
+import { parser } from '../parser';
+import { assertionData } from './assertionData';
+
+describe('parser', () => {
+  it('parses simple comment', () => {
+    const { latl } = assertionData.simpleComment;
+    const AST = parser().feed(latl).results;
+    expect(AST.length).toBe(1);
+    console.log(AST[0]);
+    // expect(AST[0]).toStrictEqual()
+  })
+
+  // it('parses multiple set definitions with comma operator', () => {
+  //   const { latl } = assertionData.commaSetDefinition;
+  //   const AST = parser().feed(latl).results;
+  //   console.log(AST);
+  // });
+
+  // it('parses set definition with alias', () => {
+  //   const { latl } = assertionData.setAliasDefinition;
+  //   const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // });
+
+  // it('parses set definition with set join', () => {
+  //   const { latl } = assertionData.setDefinitionJoin;
+  //   
const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // });
+
+  // it('parses set definition with yield operation', () => {
+  //   const { latl } = assertionData.setDefinitionYield;
+  //   const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // });
+
+  // it('parses all set join operations', () => {
+  //   const { latl } = assertionData.setOperationsJoin;
+  //   const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // });
+
+  // it('parses set filter, concat, and dissoc operations', () => {
+  //   const { latl } = assertionData.setOperations;
+  //   const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // })
+})
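
A quick way to smoke-test the stubbed grammar while iterating is the new test-grammar script, which hands a raw string to nearley-test:

    yarn test-grammar "set PLOSIVES = [ p, b, t, d, k ]"

The same check in code, mirroring parser.test.js (a sketch only: the sample input is illustrative, and it assumes grammar.js has been regenerated via yarn compile-grammar and that the file runs through the project's Babel toolchain, since parser.js uses an ES export):

    import { parser } from './src/utils/latl/parser';

    // parser() builds a fresh nearley.Parser per call; feed() mutates the
    // parser's internal state, so instances should not be reused across inputs.
    const results = parser().feed('set PLOSIVES = [ p, b, t, d, k ]').results;

    // One entry means the input parsed unambiguously; several entries
    // point to a genuine ambiguity in grammar.ne worth chasing down.
    console.log(JSON.stringify(results, null, 2));

Exporting parser as a factory rather than a shared instance is what lets each test (and each throwaway check like this one) start from a clean parse state.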