From 2634e35a01d6efa000284796918040b6ba785a09 Mon Sep 17 00:00:00 2001 From: Sorrel Bri Date: Sat, 14 Mar 2020 22:14:31 -0700 Subject: [PATCH] define tokens for lexing set definitions, aliases, or operation, and aliases --- public/latl/ipa.latl | 75 ++++++---- public/latl/waffle.latl | 8 +- src/utils/latl/README.md | 20 ++- src/utils/latl/lexer.js | 113 +++++++++----- src/utils/latl/lexer.test.js | 23 --- src/utils/latl/test/assertionData.js | 211 +++++++++++++++++++++++++++ src/utils/latl/test/lexer.test.js | 53 +++++++ 7 files changed, 405 insertions(+), 98 deletions(-) delete mode 100644 src/utils/latl/lexer.test.js create mode 100644 src/utils/latl/test/assertionData.js create mode 100644 src/utils/latl/test/lexer.test.js diff --git a/public/latl/ipa.latl b/public/latl/ipa.latl index 05a4eb2..19d35d5 100644 --- a/public/latl/ipa.latl +++ b/public/latl/ipa.latl @@ -1,19 +1,19 @@ -set NASAL_PULMONIC_CONSONANTS = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ] - STOP_PULMONIC_CONSONANTS = [ p, b, p̪, b̪, t̼, d̼, t, d, ʈ, ɖ, c, ɟ, k, ɡ, q, ɢ, ʡ, ʔ ] - S_FRICATIVE_PULMONIC_CONSONANTS = [ s, z, ʃ, ʒ, ʂ, ʐ, ɕ, ʑ ] - FRICATIVE_PULMONIC_CONSONANTS = [ ɸ, β, f, v, θ̼, ð̼, θ, ð, θ̠, ð̠, ɹ̠̊˔, ɹ̠˔, ɻ˔, ç, ʝ, x, ɣ, χ, ʁ, ħ, ʕ, h, ɦ ] - APPROXIMANT_PULMONIC_CONSONANTS = [ ʋ̥, ʋ, ɹ̥, ɹ, ɻ̊, ɻ, j̊, j, ɰ̊, ɰ, ʔ̞ ] - TAP_PULMONIC_CONSONANTS = [ ⱱ̟, ⱱ, ɾ̼, ɾ̥, ɾ, ɽ̊, ɽ, ɢ̆, ʡ̆ ] - TRILL_PULMONIC_CONSONANTS = [ ʙ̥, ʙ, r̥, r, ɽ̊r̥, ɽr, ʀ̥, ʀ, ʜ, ʢ ] - L_FRICATIVE_PULMONIC_CONSONANTS = [ ɬ, ɮ, ɭ̊˔, ɭ˔, ʎ̝̊, ʎ̝, ʟ̝̊, ʟ̝ ] - L_APPROXIMANT_PULMONIC_CONSONANTS = [ l̥, l, ɭ̊, ɭ, ʎ̥, ʎ, ʟ̥, ʟ, ʟ̠ ] - L_TAP_PULMONIC_CONSONANTS = [ ɺ, ɭ̆, ʎ̆, ʟ̆ ] - AFFRICATE_PULMONIC_CONSONANTS = [ pɸ, bβ, p̪f, b̪v, t̪θ, d̪ð, tɹ̝̊, dɹ̝, t̠ɹ̠̊˔, d̠ɹ̠˔, cç, ɟʝ, kx, ɡɣ, qχ, ʡʢ, ʔh ] - S_AFFRICATE_PULMONIC_CONSONANTS = [ ts, dz, t̠ʃ, d̠ʒ, ʈʂ, ɖʐ, tɕ, dʑ ] - L_AFFRICATE_PULMONIC_CONSONANTS = [ tɬ, dɮ, ʈɭ̊˔, cʎ̝̊, kʟ̝̊, ɡʟ̝ ] - DOUBLE_STOP_PULMONIC_CONSONANTS = [ t͡p, d͡b, k͡p, ɡ͡b, q͡ʡ ] - DOUBLE_NASAL_PULMONIC_CONSONANTS = [ n͡m, ŋ͡m ] - DOUBLE_FRICATIVE_PULMONIC_CONSONANTS = [ ɧ ] +set NASAL_PULMONIC_CONSONANTS = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ], + STOP_PULMONIC_CONSONANTS = [ p, b, p̪, b̪, t̼, d̼, t, d, ʈ, ɖ, c, ɟ, k, ɡ, q, ɢ, ʡ, ʔ ], + S_FRICATIVE_PULMONIC_CONSONANTS = [ s, z, ʃ, ʒ, ʂ, ʐ, ɕ, ʑ ], + FRICATIVE_PULMONIC_CONSONANTS = [ ɸ, β, f, v, θ̼, ð̼, θ, ð, θ̠, ð̠, ɹ̠̊˔, ɹ̠˔, ɻ˔, ç, ʝ, x, ɣ, χ, ʁ, ħ, ʕ, h, ɦ ], + APPROXIMANT_PULMONIC_CONSONANTS = [ ʋ̥, ʋ, ɹ̥, ɹ, ɻ̊, ɻ, j̊, j, ɰ̊, ɰ, ʔ̞ ], + TAP_PULMONIC_CONSONANTS = [ ⱱ̟, ⱱ, ɾ̼, ɾ̥, ɾ, ɽ̊, ɽ, ɢ̆, ʡ̆ ], + TRILL_PULMONIC_CONSONANTS = [ ʙ̥, ʙ, r̥, r, ɽ̊r̥, ɽr, ʀ̥, ʀ, ʜ, ʢ ], + L_FRICATIVE_PULMONIC_CONSONANTS = [ ɬ, ɮ, ɭ̊˔, ɭ˔, ʎ̝̊, ʎ̝, ʟ̝̊, ʟ̝ ], + L_APPROXIMANT_PULMONIC_CONSONANTS = [ l̥, l, ɭ̊, ɭ, ʎ̥, ʎ, ʟ̥, ʟ, ʟ̠ ], + L_TAP_PULMONIC_CONSONANTS = [ ɺ, ɭ̆, ʎ̆, ʟ̆ ], + AFFRICATE_PULMONIC_CONSONANTS = [ pɸ, bβ, p̪f, b̪v, t̪θ, d̪ð, tɹ̝̊, dɹ̝, t̠ɹ̠̊˔, d̠ɹ̠˔, cç, ɟʝ, kx, ɡɣ, qχ, ʡʢ, ʔh ], + S_AFFRICATE_PULMONIC_CONSONANTS = [ ts, dz, t̠ʃ, d̠ʒ, ʈʂ, ɖʐ, tɕ, dʑ ], + L_AFFRICATE_PULMONIC_CONSONANTS = [ tɬ, dɮ, ʈɭ̊˔, cʎ̝̊, kʟ̝̊, ɡʟ̝ ], + DOUBLE_STOP_PULMONIC_CONSONANTS = [ t͡p, d͡b, k͡p, ɡ͡b, q͡ʡ ], + DOUBLE_NASAL_PULMONIC_CONSONANTS = [ n͡m, ŋ͡m ], + DOUBLE_FRICATIVE_PULMONIC_CONSONANTS = [ ɧ ], DOUBLE_APPROXIMANT_PULMONIC_CONSONANTS = [ ʍ, w, ɥ̊, ɥ, ɫ ] set PULMONIC_CONSONANTS, C = { NASAL_PULMONIC_CONSONANTS or STOP_PULMONIC_CONSONANTS @@ -28,10 +28,10 @@ set PULMONIC_CONSONANTS, C = { NASAL_PULMONIC_CONSONANTS or STOP_PULM } -set STOP_EJECTIVE_CONSONANTS = [ pʼ, tʼ, ʈʼ, cʼ, kʼ, qʼ, ʡʼ ] - FRICATIVE_EJECTIVE_CONSONANTS = [ ɸʼ, fʼ, θʼ, sʼ, ʃʼ, ʂʼ, ɕʼ, xʼ, χʼ ] - L_FRICATIVE_EJECTIVE_CONSONANTS = [ ɬʼ ] - AFFRICATE_EJECTIVE_CONSONANTS = [ tsʼ, t̠ʃʼ, ʈʂʼ, kxʼ, qχʼ ] +set STOP_EJECTIVE_CONSONANTS = [ pʼ, tʼ, ʈʼ, cʼ, kʼ, qʼ, ʡʼ ], + FRICATIVE_EJECTIVE_CONSONANTS = [ ɸʼ, fʼ, θʼ, sʼ, ʃʼ, ʂʼ, ɕʼ, xʼ, χʼ ], + L_FRICATIVE_EJECTIVE_CONSONANTS = [ ɬʼ ], + AFFRICATE_EJECTIVE_CONSONANTS = [ tsʼ, t̠ʃʼ, ʈʂʼ, kxʼ, qχʼ ], L_AFFRICATE_EJECTIVE_CONSONANTS = [ tɬʼ, cʎ̝̊ʼ, kʟ̝̊ʼ ] set EJECTIVE_CONSONANTS = { STOP_EJECTIVE_CONSONANTS or FRICATIVE_EJECTIVE_CONSONANTS @@ -39,9 +39,9 @@ set EJECTIVE_CONSONANTS = { STOP_EJECTIVE_CONSONANTS or FRICATIVE_ or L_AFFRICATE_EJECTIVE_CONSONANTS } -set TENUIS_CLICK_CONSONANTS = [ ʘ, ǀ, ǃ, ǂ ] - VOICED_CLICK_CONSONANTS = [ ʘ̬, ǀ̬, ǃ̬, ǂ̬ ] - NASAL_CLICK_CONSONANTS = [ ʘ̃, ǀ̃, ǃ̃, ǂ̃ ] +set TENUIS_CLICK_CONSONANTS = [ ʘ, ǀ, ǃ, ǂ ], + VOICED_CLICK_CONSONANTS = [ ʘ̬, ǀ̬, ǃ̬, ǂ̬ ], + NASAL_CLICK_CONSONANTS = [ ʘ̃, ǀ̃, ǃ̃, ǂ̃ ], L_CLICK_CONSONANTS = [ ǁ, ǁ̬ ] set CLICK_CONSONANTS = { TENUIS_CLICK_CONSONANTS or VOICED_CLICK_CONSONANTS @@ -52,25 +52,36 @@ set IMPLOSIVE_CONSONANTS = [ ɓ, ɗ, ᶑ, ʄ, ɠ, ʛ, ɓ̥, ɗ̥, set NON_PULMONIC_CONSONANTS = { EJECTIVE_CONSONANTS or CLICK_CONSONANTS or IMPLOSIVE_CONSONANTS } -set IMPLOSIVE_CONSONANTS = { PULMONIC_CONSONANTS or NON_PULMONIC_CONSONANTS } +set CONSONANTS = { PULMONIC_CONSONANTS or NON_PULMONIC_CONSONANTS } -set MODAL_VOWELS = [ i, y, ɨ, ʉ, ɯ, u, ɪ, ʏ, ʊ, e, ø ɘ, ɵ ɤ, o, ø̞ ə, o̞, ɛ, œ ɜ, ɞ ʌ, ɔ, æ, ɐ, a, ɶ, ä, ɑ, ɒ ] - BREATHY_VOWELS = { [ V ] in MODAL_VOWELS yield [ V̤ ] } - VOICELESS_VOWELS = { [ V ] in MODAL_VOWELS yield [ V̥ ] } +set MODAL_VOWELS = [ i, y, ɨ, ʉ, ɯ, u, ɪ, ʏ, ʊ, e, ø ɘ, ɵ ɤ, o, ø̞ ə, o̞, ɛ, œ ɜ, ɞ ʌ, ɔ, æ, ɐ, a, ɶ, ä, ɑ, ɒ ], + BREATHY_VOWELS = { [ V ] in MODAL_VOWELS yield [ V̤ ] }, + VOICELESS_VOWELS = { [ V ] in MODAL_VOWELS yield [ V̥ ] }, CREAKY_VOWELS = { [ V ] in MODAL_VOWELS yield [ V̰ ] } -set SHORT_ORAL_VOWELS = { MODAL_VOWELS or BREATHY_VOWELS or CREAKY_VOWELS or VOICELESS_VOWELS } - LONG_ORAL_VOWELS = { [ V ] in SHORT_ORAL_VOWELS [ Vː ] } +set SHORT_ORAL_VOWELS = { MODAL_VOWELS or BREATHY_VOWELS or CREAKY_VOWELS or VOICELESS_VOWELS }, + LONG_ORAL_VOWELS = { [ V ] in SHORT_ORAL_VOWELS [ Vː ] }, ORAL_VOWELS = { SHORT_ORAL_VOWELS or LONG_ORAL_VOWELS } -set NASAL_VOWELS = { [ V ] in ORAL_VOWELS yield [ Ṽ ] } - SHORT_NASAL_VOWELS = { [ Vː ] in NASAL_VOWELS yield [ V ]ː } +set NASAL_VOWELS = { [ V ] in ORAL_VOWELS yield [ Ṽ ] }, + SHORT_NASAL_VOWELS = { [ Vː ] in NASAL_VOWELS yield [ V ]ː }, LONG_NASAL_VOWELS = { [ Vː ] in NASAL_VOWELS } set VOWELS = { ORAL_VOWELS or NASAL_VOWELS } -print { GLOBAL } +set PHONES = { VOWELS or CONSONANTS } +print [ GLOBAL ] + +[lateral + += + L_AFFRICATE_EJECTIVE_CONSONANTS, L_AFFRICATE_PULMONIC_CONSONANTS, L_APPROXIMANT_PULMONIC_CONSONANTS, + L_CLICK_CONSONANTS, L_FRICATIVE_EJECTIVE_CONSONANTS, L_FRICATIVE_PULMONIC_CONSONANTS, L_TAP_PULMONIC_CONSONANTS + -= + { not { [+ lateral ] in CONSONANTS } }, VOWELS + ; alternative + ; { not { [+ lateral ] in PHONES } } +] *proto-lang diff --git a/public/latl/waffle.latl b/public/latl/waffle.latl index 5cf180b..281ad3e 100644 --- a/public/latl/waffle.latl +++ b/public/latl/waffle.latl @@ -15,7 +15,8 @@ ; -- -TENSE = æ / ə / ɪ̞ / ɛ / ʌ / ʊ̞ / ɔ ; ---- DIPHTHONGS = eə / eɪ̯ / ju̟ / äɪ̞ / ɔɪ̞ / oʊ̞ / aʊ̞ / ɑɹ / iɹ / ɛɹ / ɔɹ / ʊɹ -; ---- CONSONANTS = p (pʰ) / b (b̥) / t (tʰ)(ɾ)(ʔ) / d (d̥)(ɾ) / tʃ / dʒ (d̥ʒ̊) / k (kʰ) / g (g̊) / f / v (v̥) / θ / ð (ð̥) / s / z (z̥) / ʃ / ʒ (ʒ̊) / h (ɦ)(ç) / m (ɱ)(m̩) / n(n̩) / ŋ / l (l̩)/ ɹ (ɹʲ ~ ɹˤ)(ɹ̩) / w (w̥) / j / x / ʔ +; ---- CONSONANTS = p (pʰ) / b (b̥) / t (tʰ)(ɾ)(ʔ) / d (d̥)(ɾ) / tʃ / dʒ (d̥ʒ̊) / k (kʰ) / g (g̊) / f / v (v̥) / θ / ð (ð̥) / +; s / z (z̥) / ʃ / ʒ (ʒ̊) / h (ɦ)(ç) / m (ɱ)(m̩) / n(n̩) / ŋ / l (l̩)/ ɹ (ɹʲ ~ ɹˤ)(ɹ̩) / w (w̥) / j / x / ʔ ; -- PLOSIVES = p / p' / pʰ / t / t' / tʰ ɾ / k / k' / kʰ ; -- AFFRICATES = tʃ / dʒ ; -- FRICATIVES = f / v / θ / ð / s / z / ʃ / ʒ / ç / x @@ -46,7 +47,8 @@ set PLOSIVES [ p, pʰ, t, tʼ, tʰ, ɾ, kʼ, k, kʰ ] ; { SET_A not SET_B } left anti join ; { SET_A and SET_B } inner join ; { SET_A or SET_B } full outer join -; { SET_A nor SET_B } = { GLOBAL not { SET_A and SET_B } } +; { not SET_A } = { GLOBAL not SET_A } +; { not SET_A nor SET_B } = { GLOBAL not { SET_A or SET_B } } ; ---- set character operations - non-mutable! ; { [ Xy ] in SET_A } FILTER: where X is any character and y is a filtering character @@ -125,7 +127,7 @@ set PLOSIVES [ p, pʰ, t, tʼ, tʰ, ɾ, kʼ, k, kʰ ] ; ASPIRATED PLOSIVES pʰ, tʰ, kʰ, ; ASPIRATED AFFRICATES - , + ; SPREAD LARYNGEALS h ɦ -= diff --git a/src/utils/latl/README.md b/src/utils/latl/README.md index bab3da2..865a1bf 100644 --- a/src/utils/latl/README.md +++ b/src/utils/latl/README.md @@ -30,11 +30,29 @@ A -> B / . _ . ; environment indicated with underscore and placeholder dots ## Language Primitives ## Data Structures ### Sets +Sets are collections of pointers to phones. The GLOBAL set contains all phones, making all other sets subsets of GLOBAL. +#### Global Set +[ GLOBAL ] is a shorthand for [ GLOBAL.SETS ] #### Set Definition #### Set Usage -#### Set Operation +#### Set Operations +##### 'and' Operation +##### 'or' Operation +##### 'not' Operation +##### 'nor' Operation +##### 'in' Operation +##### 'yield' Operation ### Lexemes #### Lexeme Operations ### Phone +For set of phones 'a', 'b', and 'ab': +``` +GLOBAL ┬▻ ┬▻ ┬▻ { feature: , ... } + │ │ └▻ grapheme: + │ └┬▻ { feature: , ... } + │ └▻ grapheme: + └┬▻ { feature: , ... } + └▻ grapheme: +``` #### Phone Operations ### Epochs \ No newline at end of file diff --git a/src/utils/latl/lexer.js b/src/utils/latl/lexer.js index 6b88d2f..dd48a9c 100644 --- a/src/utils/latl/lexer.js +++ b/src/utils/latl/lexer.js @@ -2,19 +2,24 @@ const moo = require('moo'); export const lexer = moo.states({ main: { - comment: /;.*/, - epochParent: { match: /\*/, push: 'epoch' }, + comment: /;.*$/, + star: { match: /\*/, push: 'epoch' }, slash: { match: /\//, push: 'lexicon' }, // change so that identifiers are always upper, keywords are always lower, phones are always lower - identifier: { match: /[A-Za-z]+[\u00c0-\u03FFA-Za-z0-9\\-\\_]*/, type: moo.keywords({ - 'kw-set': { match: 'set', push: 'setDefinition' } - })}, + 'kw-set': { match: 'set', type: moo.keywords({ 'kw-set': 'set '}), push: 'setDefinition'}, + identifier: { match: /[A-Za-z]+[\u00c0-\u03FFA-Za-z0-9\\-\\_]*/, }, + // type: moo.keywords({ + // 'kw-set': 'set' + // // { match: 'set', push: 'setDefinition' }, + // })}, openBracket: { match: /\[/, push: 'feature' }, - space: { match: /\s+/, lineBreaks: true } + whiteSpace: { match: /\s+/, lineBreaks: true }, + newLine: { match: /\n+/, lineBreaks: true } }, epoch: { identifier: { match: /[A-Za-z]+[\u00c0-\u03FFA-Za-z0-9\\-\\_]*/, push: 'rule' }, + openParen: { match: /\(/, push: 'ruleDefinition' }, pipe: { match: /\|/, pop: true }, greaterThan: /\>/, arrow: /\-\>/, @@ -22,55 +27,85 @@ export const lexer = moo.states({ slash: /\//, dot: /\./, underscore: /\_/, + newLine: { match: /\n/, lineBreaks: true } + }, + + ruleDefinition: { + doubleTick: { match: /``/, push: 'ruleName' }, + singleTick: { match: /`/, push: 'ruleDescription' }, + // push rule + closeParen: { match: /\)/, pop: true }, + newLine: { match: /\n/, lineBreaks: true } }, + ruleName: { + ruleName: { match: /.+(?=``)/ }, + doubleTick: { match: /``/, pop: true } + }, + + ruleDescription: { + ruleDescription: { match: /.+(?=`)/ }, + singleTick: { match: /`/, pop: true } + }, + rule: { openSquareBracket: { match: /\[/, push: 'ruleFeature' }, - + // whiteSpace: { match: /\s/ }, + newLine: { match: /\n/, pop: true, lineBreaks: true } }, - + ruleFeature: { ruleFeature: { match: /[A-Za-z]+[\u00c0-\u03FFA-Za-z0-9\\-\\_]*/ }, - closeBracket: { match: /\]/, pop: true } + closeBracket: { match: /\]/, pop: true }, + newLine: { match: /\n/, lineBreaks: true } }, - + lexicon: { slash: { match: /\//, pop: true }, + newLine: { match: /\n/, lineBreaks: true } }, - + feature: { closeBracket: { match: /\]/, pop: true }, positiveAssignment: /\+=/, negativeAssignment: /\-=/, + newLine: { match: /\n/, lineBreaks: true } }, - + setDefinition: { - openCurlyBracket: /\{/, - closeCurlyBracket: /\}/, + setIdentifier: { match: /[A-Z]+[A-Z_]*/ }, + openCurlyBracket: { match: /\{/, push: 'setOperation' }, + equal: /=/, openSquareBracket: /\[/, - closeSquareBracket: /\]/ + phone: /[\u00c0-\u03FFa-z]+/, + closeSquareBracket: { match: /\]/ }, + comma: { match: /,/, push: 'commaOperation' }, + whiteSpace: { match: /[\t ]+/ }, + newLine: { match: /\n/, pop: true, lineBreaks: true }, + }, + + setOperation: { + closeCurlyBracket: { match: /\}/, pop: true }, + // ! restrict identifiers + keyword: { match: ['not', 'and', 'or', 'nor', 'in', 'yield'], type: moo.keywords({ + 'kw-set-not': 'not' , + 'kw-set-and': 'and' , + 'kw-set-or': 'or' , + 'kw-set-nor': 'nor' , + 'kw-set-in': 'in' , + 'kw-set-yield': 'yield' , + }) + }, + identifier: /[A-Z]+[A-Z_]+/, + whiteSpace: /[\t ]+/, + newLine: { match: /\n/, lineBreaks: true } + }, + + commaOperation: { + // if comma is detected during a definition, the commaOperation consumes all white space and pops back to definition + // this prevents popping back to main + whiteSpace: { match: /\s+/, lineBreaks: true, pop: true }, + newLine: { match: /\n/, lineBreaks: true, pop: true } } -}); - -// ['semicolon', ';.*\n'], -// [`star`, `\\*`], - -// ['pipe', `\\|`], -// ['openBracket', `\\[`], -// ['closeBracket', `\\]`], -// ['positiveAssignment', `\\+=`], -// ['negativeAssignment', `\\-=`], -// ['plus', `\\+`], -// ['minus', `\\-`], -// ['greaterThan', `\\>`], -// ['hash', `#`], -// ['slash', `\/`], -// ['dot', `\\.`], -// ['underscore', `\\_`], - -// [`identifier`, `[A-Za-z]+[\u00c0-\u03FFA-Za-z0-9\\-\\_]*`], - -// [`phone`, `[\u00c0-\u03FFA-Za-z0]+`], -// ['equal', `=`], -// [`lineBreak`, `\\n`], -// [`whiteSpace`, `\\s+`] \ No newline at end of file + +}); \ No newline at end of file diff --git a/src/utils/latl/lexer.test.js b/src/utils/latl/lexer.test.js deleted file mode 100644 index 55cb014..0000000 --- a/src/utils/latl/lexer.test.js +++ /dev/null @@ -1,23 +0,0 @@ -import { lexer } from './lexer'; - -describe('lexer', () => { - const extractToken = obj => ({ type: obj.type, value: obj.value }); - - it('lexes simple comment', () => { - lexer.reset('; comment'); - const token = lexer.next(); - expect(extractToken(token)).toStrictEqual({ type: 'comment', value: '; comment'}); - }); - - it('lexes simple * and identifier', () => { - lexer.reset('*proto'); - const stream = [ extractToken(lexer.next()), extractToken(lexer.next()) ]; - expect(stream).toStrictEqual([ { type: 'star', value: '*' }, { type: 'identifier', value: 'proto' } ]); - }) - - it('lexes set and identifier', () => { - lexer.reset('set PLOSIVES'); - const stream = [ extractToken(lexer.next()), extractToken(lexer.next()), extractToken(lexer.next()) ]; - expect(stream).toStrictEqual([ { type: 'kw-set', value: 'set' }, { type: 'space', value: ' ' }, { type: 'identifier', value: 'PLOSIVES' } ]); - }) -}) \ No newline at end of file diff --git a/src/utils/latl/test/assertionData.js b/src/utils/latl/test/assertionData.js new file mode 100644 index 0000000..641e307 --- /dev/null +++ b/src/utils/latl/test/assertionData.js @@ -0,0 +1,211 @@ +export const assertionData = { + setDefinition: { + latl: ` +set NASAL_PULMONIC_CONSONANTS = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ], + STOP_PULMONIC_CONSONANTS = [ p, b, p̪, b̪, t̼, d̼, t, d, ʈ, ɖ, c, ɟ, k, ɡ, q, ɢ, ʡ, ʔ ]`, + tokens: [ + { type: 'whiteSpace', value: '\n' }, + { type: 'kw-set', value: 'set' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'setIdentifier', value: 'NASAL_PULMONIC_CONSONANTS' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'equal', value: '=' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'openSquareBracket', value: '[' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'm̥' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'm' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɱ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'n̼' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'n̥' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'n' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɳ̊' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɳ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɲ̊' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɲ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ŋ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: '̊ŋ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɴ' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'closeSquareBracket', value: ']' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: '\n ' }, + { type: 'setIdentifier', value: 'STOP_PULMONIC_CONSONANTS' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'equal', value: '=' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'openSquareBracket', value: '[' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'p' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'b' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'p̪' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'b̪' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 't̼' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'd̼' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 't' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'd' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ʈ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɖ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'c' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɟ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'k' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɡ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'q' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɢ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ʡ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ʔ' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'closeSquareBracket', value: ']' } + ] + }, + setAliasDefinition: { + latl: ` +set NASAL_PULMONIC_CONSONANTS, N = [ m̥, m, ɱ, n̼, n̥, n, ɳ̊, ɳ, ɲ̊, ɲ, ŋ, ̊ŋ, ɴ ]`, + tokens: [ + { type: 'whiteSpace', value: '\n' }, + { type: 'kw-set', value: 'set' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'setIdentifier', value: 'NASAL_PULMONIC_CONSONANTS' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'setIdentifier', value: 'N' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'equal', value: '=' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'openSquareBracket', value: '[' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'm̥' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'm' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɱ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'n̼' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'n̥' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'n' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɳ̊' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɳ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɲ̊' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɲ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ŋ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: '̊ŋ' }, + { type: 'comma', value: ',' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'phone', value: 'ɴ' }, + { type: 'whiteSpace', value: ' ' }, + { type: 'closeSquareBracket', value: ']' }, + ] + }, + setDefinitionJoin: { + latl: ` +set CLICK_CONSONANTS = { TENUIS_CLICK_CONSONANTS or VOICED_CLICK_CONSONANTS + or NASAL_CLICK_CONSONANTS or L_CLICK_CONSONANTS + }`, + tokens: [ + { type: 'whiteSpace', value: '\n'}, + { type: 'kw-set', value: 'set'}, + { type: 'whiteSpace', value: ' '}, + { type: 'setIdentifier', value: 'CLICK_CONSONANTS'}, + { type: 'whiteSpace', value: ' '}, + { type: 'equal', value: '='}, + { type: 'whiteSpace', value: ' '}, + { type: 'openCurlyBracket', value: '{'}, + { type: 'whiteSpace', value: ' '}, + { type: 'identifier', value: 'TENUIS_CLICK_CONSONANTS'}, + { type: 'whiteSpace', value: ' '}, + { type: 'kw-set-or', value: 'or'}, + { type: 'whiteSpace', value: ' '}, + { type: 'identifier', value: 'VOICED_CLICK_CONSONANTS'}, + { type: 'newLine', value: '\n'}, + { type: 'whiteSpace', value: ' '}, + { type: 'kw-set-or', value: 'or'}, + { type: 'whiteSpace', value: ' '}, + { type: 'identifier', value: 'NASAL_CLICK_CONSONANTS'}, + { type: 'whiteSpace', value: ' '}, + { type: 'kw-set-or', value: 'or'}, + { type: 'whiteSpace', value: ' '}, + { type: 'identifier', value: 'L_CLICK_CONSONANTS'}, + { type: 'whiteSpace', value: ' '}, + { type: 'newLine', value: '\n'}, + { type: 'whiteSpace', value: ' '}, + { type: 'closeCurlyBracket', value: '}'} + ] + }, +} \ No newline at end of file diff --git a/src/utils/latl/test/lexer.test.js b/src/utils/latl/test/lexer.test.js new file mode 100644 index 0000000..0945da4 --- /dev/null +++ b/src/utils/latl/test/lexer.test.js @@ -0,0 +1,53 @@ +import { lexer } from '../lexer'; +import { assertionData } from './assertionData'; + +describe('lexer', () => { + const getToken = obj => obj ? formatToken(obj) : null; + const formatToken = obj => ({ type: obj.type, value: obj.value }); + const getStream = latl => { + lexer.reset(latl); + let token = getToken(lexer.next()); + let stream = []; + do { + stream = [...stream, token] + token = getToken(lexer.next()); + } while (token); + return stream; + } + + it('lexes simple comment', () => { + lexer.reset('; comment'); + const token = lexer.next(); + expect(getToken(token)).toStrictEqual({ type: 'comment', value: '; comment'}); + }); + + it('lexes simple * and identifier', () => { + lexer.reset('*proto'); + const stream = [ getToken(lexer.next()), getToken(lexer.next()) ]; + expect(stream).toStrictEqual([ { type: 'star', value: '*' }, { type: 'identifier', value: 'proto' } ]); + }) + + it('lexes set and identifier', () => { + lexer.reset('set PLOSIVES'); + const stream = [ getToken(lexer.next()), getToken(lexer.next()), getToken(lexer.next()) ]; + expect(stream).toStrictEqual([ { type: 'kw-set', value: 'set' }, { type: 'whiteSpace', value: ' ' }, { type: 'setIdentifier', value: 'PLOSIVES' } ]); + }) + + it('lexes multiple set definitions with comma operator', () => { + const { latl, tokens } = assertionData.setDefinition; + const stream = getStream(latl); + expect(stream).toStrictEqual(tokens); + }); + + it('lexes set definition with alias', () => { + const { latl, tokens } = assertionData.setAliasDefinition; + const stream = getStream(latl); + expect(stream).toStrictEqual(tokens); + }); + + it('lexes set definition with set join', () => { + const { latl, tokens } = assertionData.setDefinitionJoin; + const stream = getStream(latl); + expect(stream).toStrictEqual(tokens); + }) +}) \ No newline at end of file