add tokenizer for epoch, feature, and lexicon tokens

Sorrel Bri 2020-03-01 22:42:35 -08:00
parent d5d1eb2fa2
commit 6bd425ac34
2 changed files with 121 additions and 2 deletions

@@ -9,3 +9,61 @@ export const parseLatl = (state, action) => {
  let latl = state.action;
  return { ...state }
}

const getOneToken = (latl, tokens) => {
  // try each token type in order; the first anchored match wins
  for (const [type, regEx] of tokenTypes) {
    const newRegEx = new RegExp(`^(${regEx})`);
    const match = latl.match(newRegEx);
    if (match) {
      const newTokens = [...tokens, match[0]];
      // consume the matched text plus any whitespace that follows it
      const newLatl = latl.slice(match[0].length).trim();
      return [newLatl, newTokens];
    }
  }
  throw `Unexpected token at ${latl.split('\n')[0]}`;
}

export const tokenize = latl => {
  let tokens = [];
  let newLatl = latl.trim();
  try {
    // consume one token at a time until the input is exhausted
    while (newLatl.length) {
      [newLatl, tokens] = getOneToken(newLatl, tokens);
    }
    return tokens;
  }
  catch (err) {
    return { errors: 'tokenization error', message: err };
  }
}

export const generateAST = latl => {
  // tokenize
  const tokens = tokenize(latl);
  // build tree
}

// ordered [type, pattern] pairs; multi-character operators such as '+='
// must precede their single-character prefixes, since the first match wins
const tokenTypes = [
  ['star', `\\*`],
  ['pipe', `\\|`],
  ['openBracket', `\\[`],
  ['closeBracket', `\\]`],
  ['positiveAssignment', `\\+=`],
  ['negativeAssignment', `\\-=`],
  ['plus', `\\+`],
  ['minus', `\\-`],
  ['greaterThan', `\\>`],
  ['hash', `#`],
  ['slash', `\\/`],
  ['dot', `\\.`],
  ['loDash', `\\_`],
  ['variable', `[A-Za-z]+`],
  ['equal', `=`]
  // ['lineBreak', `\\n`],
  // ['whiteSpace', `\\s+`]
]
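
For reference, a minimal usage sketch of the new tokenize export (the outputs are traced by hand from the matching rules above, and mirror the test fixtures in the next file):

import { tokenize } from './reducer.latl';

// a well-formed fragment flattens into an array of raw lexemes
tokenize('*PROTO\n|CHILD');
// -> ['*', 'PROTO', '|', 'CHILD']

// a character with no matching pattern trips the catch block
tokenize('*PROTO\n%');
// -> { errors: 'tokenization error', message: 'Unexpected token at %' }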

@@ -1,5 +1,6 @@
import { stateReducer } from './reducer';
import { initState } from './reducer.init';
import { tokenize } from './reducer.latl';
describe('LATL', () => {
  it('returns state unaltered with no action body', () => {
@@ -11,4 +12,64 @@ describe('LATL', () => {
    const returnedState = stateReducer(state, action);
    expect(returnedState).toStrictEqual(state);
  })
  it('returns tokens from well-formed latl epoch definition', () => {
    const tokens = tokenize(epochDefinitionLatl);
    expect(tokens).toStrictEqual(tokenizedEpoch);
  });
  it('returns tokens from well-formed latl feature definition', () => {
    const tokens = tokenize(featureDefinitionLatl);
    expect(tokens).toStrictEqual(tokenizedFeature);
  });
  it('returns tokens from well-formed latl lexicon definition', () => {
    const tokens = tokenize(lexiconDefinitionLatl);
    expect(tokens).toStrictEqual(tokenizedLexicon);
  });
})
const epochDefinitionLatl = `
*PROTO
[+ FEATURE]>[- FEATURE]/._.
n>m/#_.
|CHILD
`
const tokenizedEpoch = [
  '*', 'PROTO',
  '[', '+', 'FEATURE', ']', '>', '[', '-', 'FEATURE', ']', '/', '.', '_', '.',
  'n', '>', 'm', '/', '#', '_', '.',
  '|', 'CHILD'
]
const featureDefinitionLatl = `
[+ PLOSIVE] = kp / p / b / d / t / g / k
[- PLOSIVE] = m / n / s / z
[SONORANT
+= m / n
-= s / z / kp / p / b / d / t / g / k
]
`
const tokenizedFeature = [
  '[', '+', 'PLOSIVE', ']', '=', 'kp', '/', 'p', '/', 'b', '/', 'd', '/', 't', '/', 'g', '/', 'k',
  '[', '-', 'PLOSIVE', ']', '=', 'm', '/', 'n', '/', 's', '/', 'z',
  '[', 'SONORANT',
  '+=', 'm', '/', 'n',
  '-=', 's', '/', 'z', '/', 'kp', '/', 'p', '/', 'b', '/', 'd', '/', 't', '/', 'g', '/', 'k',
  ']'
]
const lexiconDefinitionLatl = `
/PROTO
kpn
sm
/
`
const tokenizedLexicon = [
  '/', 'PROTO',
  'kpn',
  'sm',
  '/'
]
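
One detail worth calling out from the tokenTypes ordering in the first file: getOneToken returns the first pattern that matches, so two-character operators have to be listed ahead of their one-character prefixes. A small sketch of the failure mode under a hypothetical reordering (not part of this commit):

const anchor = pattern => new RegExp(`^(${pattern})`);

// positiveAssignment tried first: '+=' comes back as a single token
'+= m / n'.match(anchor(`\\+=`))[0]; // '+='

// if plus were tried first, the same input would split into '+' and '='
'+= m / n'.match(anchor(`\\+`))[0];  // '+'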