From aa19d42a11e34d9932e49320eaad4d3b450b3209 Mon Sep 17 00:00:00 2001 From: Sorrel Bri Date: Fri, 27 Mar 2020 15:55:45 -0700 Subject: [PATCH] stub parser --- package.json | 3 +- src/utils/grammar.js | 0 src/utils/latl/grammar.js | 49 +++++++++++++++--- src/utils/latl/grammar.ne | 77 ++++++++++++++++++++++++++-- src/utils/latl/lexer.js | 12 ++--- src/utils/latl/parser.js | 2 +- src/utils/latl/test/assertionData.js | 28 +++++++--- src/utils/latl/test/lexer.test.js | 24 ++++----- src/utils/latl/test/parser.test.js | 49 ++++++++++++++++++ 9 files changed, 205 insertions(+), 39 deletions(-) create mode 100644 src/utils/grammar.js create mode 100644 src/utils/latl/test/parser.test.js diff --git a/package.json b/package.json index 48fcadd..c71741a 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,8 @@ }, "scripts": { "start": "react-scripts start", - "compile-grammar": "nearleyc src/utils/grammar.ne -o src/utils/grammar.js", + "compile-grammar": "nearleyc src/utils/latl/grammar.ne -o src/utils/latl/grammar.js", + "test-grammar": "nearley-test src/utils/latl/grammar.js --input", "flow": "flow", "build": "react-scripts build", "test": "react-scripts test", diff --git a/src/utils/grammar.js b/src/utils/grammar.js new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/latl/grammar.js b/src/utils/latl/grammar.js index c3f4012..9e51a0c 100644 --- a/src/utils/latl/grammar.js +++ b/src/utils/latl/grammar.js @@ -3,17 +3,50 @@ (function () { function id(x) { return x[0]; } - const lexer = require('./lexer'); + const { lexer } = require('./lexer.js'); + const getTerminal = d => d ? d[0] : null; + const getAll = d => d.map((item, i) => ({[i]: item})); + const flag = token => d => d.map(item => ({[token]: item})) + const clearNull = d => d.filter(t => !!t); + const flagIndex = d => d.map((item, i) => ({[i]: item})) + const remove = _ => null; + const append = d => d.join(''); + const constructSet = d => d.reduce((acc, t) => { + if (t && t.type === 'setIdentifier') acc.push({set: t}) + if (t && t.length) acc[acc.length - 1].phones = t; + return acc; + }, []); + const compose = (...funcs) => d => funcs.reduce((acc, func) => func(acc), d) var grammar = { Lexer: lexer, ParserRules: [ - {"name": "main$ebnf$1$subexpression$1", "symbols": ["statement", {"literal":"\n"}]}, - {"name": "main$ebnf$1", "symbols": ["main$ebnf$1$subexpression$1"]}, - {"name": "main$ebnf$1$subexpression$2", "symbols": ["statement", {"literal":"\n"}]}, - {"name": "main$ebnf$1", "symbols": ["main$ebnf$1", "main$ebnf$1$subexpression$2"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}}, - {"name": "main", "symbols": ["main$ebnf$1"]}, - {"name": "statement", "symbols": [{"literal":"foo"}]}, - {"name": "statement", "symbols": [{"literal":"bar"}]} + {"name": "main$ebnf$1", "symbols": []}, + {"name": "main$ebnf$1$subexpression$1", "symbols": ["statement"]}, + {"name": "main$ebnf$1", "symbols": ["main$ebnf$1", "main$ebnf$1$subexpression$1"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}}, + {"name": "main", "symbols": ["main$ebnf$1"], "postprocess": compose(flag('main'), getTerminal)}, + {"name": "_$ebnf$1$subexpression$1", "symbols": [(lexer.has("whiteSpace") ? {type: "whiteSpace"} : whiteSpace)]}, + {"name": "_$ebnf$1", "symbols": ["_$ebnf$1$subexpression$1"], "postprocess": id}, + {"name": "_$ebnf$1", "symbols": [], "postprocess": function(d) {return null;}}, + {"name": "_", "symbols": ["_$ebnf$1"], "postprocess": remove}, + {"name": "__", "symbols": [(lexer.has("whiteSpace") ? 
{type: "whiteSpace"} : whiteSpace)], "postprocess": remove}, + {"name": "statement", "symbols": ["comment"]}, + {"name": "statement", "symbols": ["definition"], "postprocess": compose(clearNull, getTerminal)}, + {"name": "comment", "symbols": [(lexer.has("comment") ? {type: "comment"} : comment)], "postprocess": compose(remove, getTerminal)}, + {"name": "definition", "symbols": [(lexer.has("kwSet") ? {type: "kwSet"} : kwSet), "__", "setDefinition"], "postprocess": d => ({token: 'setDefinition', sets: d[2]})}, + {"name": "setDefinition$ebnf$1", "symbols": []}, + {"name": "setDefinition$ebnf$1$subexpression$1", "symbols": [(lexer.has("setIdentifier") ? {type: "setIdentifier"} : setIdentifier), "__", (lexer.has("equal") ? {type: "equal"} : equal), "__", "setExpression", (lexer.has("comma") ? {type: "comma"} : comma), "__"]}, + {"name": "setDefinition$ebnf$1", "symbols": ["setDefinition$ebnf$1", "setDefinition$ebnf$1$subexpression$1"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}}, + {"name": "setDefinition", "symbols": ["setDefinition$ebnf$1", (lexer.has("setIdentifier") ? {type: "setIdentifier"} : setIdentifier), "__", (lexer.has("equal") ? {type: "equal"} : equal), "__", "setExpression"], "postprocess": constructSet}, + {"name": "setExpression", "symbols": [(lexer.has("openSquareBracket") ? {type: "openSquareBracket"} : openSquareBracket), "_", "phoneList", "_", (lexer.has("closeSquareBracket") ? {type: "closeSquareBracket"} : closeSquareBracket)], "postprocess": d => d.filter(t => t && t.length)}, + {"name": "phoneList$ebnf$1", "symbols": []}, + {"name": "phoneList$ebnf$1$subexpression$1", "symbols": [(lexer.has("phone") ? {type: "phone"} : phone), (lexer.has("comma") ? {type: "comma"} : comma), "_"]}, + {"name": "phoneList$ebnf$1", "symbols": ["phoneList$ebnf$1", "phoneList$ebnf$1$subexpression$1"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}}, + {"name": "phoneList", "symbols": ["phoneList$ebnf$1", (lexer.has("phone") ? {type: "phone"} : phone)], "postprocess": d => d.filter(t => t && (t.type === 'phone' || t.length) ) + .map(t => { + if (!t.length) return t; + t.filter(st => st && st.type === 'phone') + return t; + }) } ] , ParserStart: "main" } diff --git a/src/utils/latl/grammar.ne b/src/utils/latl/grammar.ne index 16fbee1..fff5adb 100644 --- a/src/utils/latl/grammar.ne +++ b/src/utils/latl/grammar.ne @@ -1,8 +1,79 @@ @{% - const lexer = require('./lexer'); + const { lexer } = require('./lexer.js'); + const getTerminal = d => d ? d[0] : null; + const getAll = d => d.map((item, i) => ({[i]: item})); + const flag = token => d => d.map(item => ({[token]: item})) + const clearNull = d => d.filter(t => !!t); + const flagIndex = d => d.map((item, i) => ({[i]: item})) + const remove = _ => null; + const append = d => d.join(''); + const constructSet = d => d.reduce((acc, t) => { + if (t && t.type === 'setIdentifier') acc.push({set: t}) + if (t && t.length) acc[acc.length - 1].phones = t; + return acc; + }, []); + const compose = (...funcs) => d => funcs.reduce((acc, func) => func(acc), d) %} @lexer lexer -main -> (statement "\n"):+ -statement -> "foo" | "bar" \ No newline at end of file +main -> (statement):* + {% compose(flag('main'), getTerminal) %} + +_ -> (%whiteSpace):? 
+  {% remove %}
+
+__ -> %whiteSpace
+  {% remove %}
+
+statement -> comment | definition
+  {% compose(clearNull, getTerminal) %}
+
+comment -> %comment
+  {% compose(remove, getTerminal) %}
+
+# SETS
+definition -> %kwSet __ setDefinition {% d => ({token: 'setDefinition', sets: d[2]}) %}
+setDefinition -> (%setIdentifier __ %equal __ setExpression %comma __):* %setIdentifier __ %equal __ setExpression
+  {% constructSet %}
+setExpression -> %openSquareBracket _ phoneList _ %closeSquareBracket
+  {% d => d.filter(t => t && t.length) %}
+phoneList -> (%phone %comma _):* %phone
+  {% d => d.filter(t => t && (t.type === 'phone' || t.length) )
+    .map(t => {
+      if (!t.length) return t;
+      // keep only phone tokens, dropping commas and whitespace nulls
+      return t.filter(st => st && st.type === 'phone');
+    }) %}
+
+
+# assignmentExpression:
+#   /*
+#    * SPEC:
+#    * conditionalExpression
+#    * | leftHandSideExpression assignmentOperator assignmentExpression
+#    */
+#   (leftHandSideExpression assignmentOperator) =>
+#     leftHandSideExpression assignmentOperator assignmentExpression
+#     | conditionalExpression
+#   ;
+
+# assignmentExpressionNoln:
+#   conditionalExpressionNoln
+#   | leftHandSideExpression assignmentOperator assignmentExpressionNoln
+#   ;
+
+# assignmentOperator:
+#   /* note that in the grammar these are listed out explicitly */
+#   EQ | TIMESEQ | DIVIDEEQ | PERCENTEQ | PLUSEQ | MINUSEQ | LSHIFTEQ | RSHIFTEQ
+#   | GT3EQ | AMPEREQ | CAROTEQ | PIPEEQ
+#   ;
+
+# expression:
+#   /*
+#    * SPEC:
+#    * assignmentExpression
+#    * | expression COMMA assignmentExpression
+#    */
+#   assignmentExpression (expressionTail)*
+#   ;
\ No newline at end of file
diff --git a/src/utils/latl/lexer.js b/src/utils/latl/lexer.js
index 995aac7..bae2c3f 100644
--- a/src/utils/latl/lexer.js
+++ b/src/utils/latl/lexer.js
@@ -1,17 +1,13 @@
 const moo = require('moo');

-export const lexer = moo.states({
+const lexer = moo.states({
   main: {
     comment: /;.*$/,
     star: { match: /\*/, push: 'epoch' },
     slash: { match: /\//, push: 'lexicon' },
     // change so that identifiers are always upper, keywords are always lower, phones are always lower
-    'kw-set': { match: 'set', type: moo.keywords({ 'kw-set': 'set '}), push: 'setDefinition'},
+    'kwSet': { match: 'set', type: moo.keywords({ 'kwSet': 'set' }), push: 'setDefinition'},
     identifier: { match: /[A-Za-z]+[\u00c0-\u03FFA-Za-z0-9\\-\\_]*/, },
-    // type: moo.keywords({
-    //   'kw-set': 'set'
-    //   // { match: 'set', push: 'setDefinition' },
-    //   })},
     openBracket: { match: /\[/, push: 'feature' },
     whiteSpace: { match: /\s+/, lineBreaks: true },
     newLine: { match: /\n+/, lineBreaks: true }
@@ -116,4 +112,6 @@
     newLine: { match: /\n/, lineBreaks: true, pop: true }
   }

-});
\ No newline at end of file
+});
+
+module.exports = {lexer};
\ No newline at end of file
diff --git a/src/utils/latl/parser.js b/src/utils/latl/parser.js
index 62d56fa..a7f5143 100644
--- a/src/utils/latl/parser.js
+++ b/src/utils/latl/parser.js
@@ -1,4 +1,4 @@
 const nearley = require("nearley");
 const grammar = require("./grammar.js");

-const parser = new nearley.Parser(nearley.Grammar.fromCompiled(grammar));
\ No newline at end of file
+export const parser = () => new nearley.Parser(nearley.Grammar.fromCompiled(grammar));
\ No newline at end of file
diff --git a/src/utils/latl/test/assertionData.js b/src/utils/latl/test/assertionData.js
index c224ef6..8b927e6 100644
--- a/src/utils/latl/test/assertionData.js
+++ b/src/utils/latl/test/assertionData.js
@@ -1,11 +1,25 @@
 export const assertionData = {
-  setDefinition: {
+  simpleComment: {
+    latl: `; comment`,
+    tokens: [
+      { type: 
'comment', value: '; comment'} + ] + }, + simpleSetDefinition: { + latl: `set PLOSIVES`, + tokens: [ + { type: 'kwSet', value: 'set' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'setIdentifier', value: 'PLOSIVES' } + ] + }, + commaSetDefinition: { latl: ` set NASAL_PULMONIC_CONSONANTS = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ], STOP_PULMONIC_CONSONANTS = [ p, b, p̪, b̪, t̼, d̼, t, d, ʈ, ɖ, c, ɟ, k, ɡ, q, ɢ, ʡ, ʔ ]`, tokens: [ { type: 'whiteSpace', value: '\n' }, - { type: 'kw-set', value: 'set' }, + { type: 'kwSet', value: 'set' }, { type: 'whiteSpace', value: ' ' }, { type: 'setIdentifier', value: 'NASAL_PULMONIC_CONSONANTS' }, { type: 'whiteSpace', value: ' ' }, @@ -121,7 +135,7 @@ set NASAL_PULMONIC_CONSONANTS = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, set NASAL_PULMONIC_CONSONANTS, N = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ]`, tokens: [ { type: 'whiteSpace', value: '\n' }, - { type: 'kw-set', value: 'set' }, + { type: 'kwSet', value: 'set' }, { type: 'whiteSpace', value: ' ' }, { type: 'setIdentifier', value: 'NASAL_PULMONIC_CONSONANTS' }, { type: 'comma', value: ',' }, @@ -180,7 +194,7 @@ set CLICK_CONSONANTS = { TENUIS_CLICK_CONSONANTS or VOICED_CLICK_CONSONANTS }`, tokens: [ { type: 'whiteSpace', value: '\n'}, - { type: 'kw-set', value: 'set'}, + { type: 'kwSet', value: 'set'}, { type: 'whiteSpace', value: ' '}, { type: 'setIdentifier', value: 'CLICK_CONSONANTS'}, { type: 'whiteSpace', value: ' '}, @@ -215,7 +229,7 @@ set NASAL_VOWELS = { [ V ] in ORAL_VOWELS yield [ Ṽ ] }, LONG_NASAL_VOWELS = { [ Vː ] in NASAL_VOWELS }`, tokens: [ { type: 'whiteSpace', value: '\n', }, - { type: 'kw-set', value: 'set', }, + { type: 'kwSet', value: 'set', }, { type: 'whiteSpace', value: ' ', }, { type: 'setIdentifier', value: 'NASAL_VOWELS', }, { type: 'whiteSpace', value: ' ', }, @@ -307,7 +321,7 @@ set SET_C = { SET_A not SET_B }, ; left anti join { type: 'whiteSpace', value: '\n', }, { type: 'comment', value: '; ---- set join operations non-mutable! 
', },
       { type: 'whiteSpace', value: '\n', },
-      { type: 'kw-set', value: 'set', },
+      { type: 'kwSet', value: 'set', },
       { type: 'whiteSpace', value: ' ', },
       { type: 'setIdentifier', value: 'SET_C', },
       { type: 'whiteSpace', value: ' ', },
@@ -406,7 +420,7 @@ set SET_B = { [ Xy ] in SET_A }, ; FILTER: where X is any character and
       { type: 'whiteSpace', value: '\n', },
       { type: 'comment', value: '; ---- set character operations - non-mutable!', },
       { type: 'whiteSpace', value: '\n', },
-      { type: 'kw-set', value: 'set', },
+      { type: 'kwSet', value: 'set', },
       { type: 'whiteSpace', value: ' ', },
       { type: 'setIdentifier', value: 'SET_B', },
       { type: 'whiteSpace', value: ' ', },
diff --git a/src/utils/latl/test/lexer.test.js b/src/utils/latl/test/lexer.test.js
index d0cb9ea..aa96096 100644
--- a/src/utils/latl/test/lexer.test.js
+++ b/src/utils/latl/test/lexer.test.js
@@ -16,25 +16,25 @@ describe('lexer', () => {
   }

   it('lexes simple comment', () => {
-    lexer.reset('; comment');
-    const token = lexer.next();
-    expect(getToken(token)).toStrictEqual({ type: 'comment', value: '; comment'});
+    const { latl, tokens } = assertionData.simpleComment;
+    const stream = getStream(latl);
+    expect(stream).toStrictEqual(tokens);
   });

-  it('lexes simple * and identifier', () => {
-    lexer.reset('*proto');
-    const stream = [ getToken(lexer.next()), getToken(lexer.next()) ];
-    expect(stream).toStrictEqual([ { type: 'star', value: '*' }, { type: 'identifier', value: 'proto' } ]);
-  })
+  // it('lexes simple * and identifier', () => {
+  //   lexer.reset('*proto');
+  //   const stream = [ getToken(lexer.next()), getToken(lexer.next()) ];
+  //   expect(stream).toStrictEqual([ { type: 'star', value: '*' }, { type: 'identifier', value: 'proto' } ]);
+  // })

   it('lexes set and identifier', () => {
-    lexer.reset('set PLOSIVES');
-    const stream = [ getToken(lexer.next()), getToken(lexer.next()), getToken(lexer.next()) ];
-    expect(stream).toStrictEqual([ { type: 'kw-set', value: 'set' }, { type: 'whiteSpace', value: ' ' }, { type: 'setIdentifier', value: 'PLOSIVES' } ]);
+    const { latl, tokens } = assertionData.simpleSetDefinition;
+    const stream = getStream(latl);
+    expect(stream).toStrictEqual(tokens);
   })

   it('lexes multiple set definitions with comma operator', () => {
-    const { latl, tokens } = assertionData.setDefinition;
+    const { latl, tokens } = assertionData.commaSetDefinition;
     const stream = getStream(latl);
     expect(stream).toStrictEqual(tokens);
   });
diff --git a/src/utils/latl/test/parser.test.js b/src/utils/latl/test/parser.test.js
new file mode 100644
index 0000000..2b36d59
--- /dev/null
+++ b/src/utils/latl/test/parser.test.js
@@ -0,0 +1,49 @@
+import { lexer } from '../lexer';
+import { parser } from '../parser';
+import { assertionData } from './assertionData';
+
+describe('parser', () => {
+  it('parses simple comment', () => {
+    const { latl } = assertionData.simpleComment;
+    const AST = parser().feed(latl).results;
+    expect(AST.length).toBe(1);
+    console.log(AST[0]);
+    // expect(AST[0]).toStrictEqual()
+  })
+
+  // it('parses multiple set definitions with comma operator', () => {
+  //   const { latl } = assertionData.commaSetDefinition;
+  //   const AST = parser().feed(latl).results;
+  //   console.log(AST);
+  // });
+
+  // it('parses set definition with alias', () => {
+  //   const { latl } = assertionData.setAliasDefinition;
+  //   const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // });
+
+  // it('parses set definition with set join', () => {
+  //   const { latl } = assertionData.setDefinitionJoin;
+  //   
const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // });
+
+  // it('parses set definition with yield operation', () => {
+  //   const { latl } = assertionData.setDefinitionYield;
+  //   const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // });
+
+  // it('parses all set join operations', () => {
+  //   const { latl } = assertionData.setOperationsJoin;
+  //   const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // });
+
+  // it('parses set filter, concat, and dissoc operations', () => {
+  //   const { latl } = assertionData.setOperations;
+  //   const AST = parser().feed(latl).results;
+  //   expect(AST.length).toBe(1);
+  // })
+})
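
A quick way to smoke-test the stubbed grammar while iterating is the new test-grammar script, which hands a raw string to nearley-test:

    yarn test-grammar "set PLOSIVES = [ p, b, t, d, k ]"

The same check in code, mirroring parser.test.js (a sketch only: the sample input is illustrative, and it assumes grammar.js has been regenerated via yarn compile-grammar and that the file runs through the project's Babel toolchain, since parser.js uses an ES export):

    import { parser } from './src/utils/latl/parser';

    // parser() builds a fresh nearley.Parser per call; feed() mutates the
    // parser's internal state, so instances should not be reused across inputs.
    const results = parser().feed('set PLOSIVES = [ p, b, t, d, k ]').results;

    // One entry means the input parsed unambiguously; several entries
    // point to a genuine ambiguity in grammar.ne worth chasing down.
    console.log(JSON.stringify(results, null, 2));

Exporting parser as a factory rather than a shared instance is what lets each test (and each throwaway check like this one) start from a clean parse state.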