add tokenizer for epoch, feature, and lexicon tokens

Sorrel Bri 2020-03-01 22:42:35 -08:00
parent d5d1eb2fa2
commit 6bd425ac34
2 changed files with 121 additions and 2 deletions


@@ -9,3 +9,61 @@ export const parseLatl = (state, action) => {
  let latl = state.action;
  return { ...state }
}
const getOneToken = (latl, tokens) => {
  // try each token type in order; the first pattern that matches
  // at the start of the input wins
  for (const [type, regEx] of tokenTypes) {
    const newRegEx = new RegExp(`^(${regEx})`);
    const match = latl.match(newRegEx);
    if (match) {
      const newTokens = [...tokens, match[0]];
      // consume the matched prefix plus any trailing whitespace
      const newLatl = latl.slice(match[0].length).trim();
      return [newLatl, newTokens];
    }
  }
  throw new Error(`Unexpected token at ${latl.split('\n')[0]}`);
}
export const tokenize = latl => {
  let tokens = [];
  let newLatl = latl.trim();
  try {
    // strip one token off the front of the input until none remains
    while (newLatl.length) {
      [newLatl, tokens] = getOneToken(newLatl, tokens);
    }
    return tokens;
  }
  catch (err) {
    return { errors: 'tokenization error', message: err.message };
  }
}
export const generateAST = latl => {
  // tokenize
  const tokens = tokenize(latl);
  // build tree (not yet implemented in this commit)
}
const tokenTypes = [
  // order matters: multi-character tokens ('+=', '-=') must come
  // before their single-character prefixes ('+', '-')
  ['star', `\\*`],
  ['pipe', `\\|`],
  ['openBracket', `\\[`],
  ['closeBracket', `\\]`],
  ['positiveAssignment', `\\+=`],
  ['negativeAssignment', `\\-=`],
  ['plus', `\\+`],
  ['minus', `\\-`],
  ['greaterThan', `\\>`],
  ['hash', `#`],
  ['slash', `\\/`],
  ['dot', `\\.`],
  ['loDash', `\\_`],
  ['variable', `[A-Za-z]+`],
  ['equal', `=`]
  // ['lineBreak', `\\n`],
  // ['whiteSpace', `\\s+`]
]
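
A minimal usage sketch of the tokenizer on one of the sound-change rules exercised by the tests below (not part of the commit; the import path is assumed from the test file):

// hypothetical usage sketch, assuming the module path used by the tests
import { tokenize } from './reducer.latl';

// the rule 'n>m/#_.' is consumed greedily, one token at a time,
// in the order the patterns appear in tokenTypes:
tokenize('n>m/#_.');
// → ['n', '>', 'm', '/', '#', '_', '.']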


@@ -1,5 +1,6 @@
import { stateReducer } from './reducer';
import { initState } from './reducer.init';
import { tokenize } from './reducer.latl';
describe('LATL', () => {
  it('returns state unaltered with no action body', () => {
@@ -11,4 +12,64 @@ describe('LATL', () => {
    const returnedState = stateReducer(state, action)
    expect(returnedState).toStrictEqual(state);
  })
  it('returns tokens from well-formed latl epoch definition', () => {
    const tokens = tokenize(epochDefinitionLatl);
    expect(tokens).toStrictEqual(tokenizedEpoch);
  });
  it('returns tokens from well-formed latl feature definition', () => {
    const tokens = tokenize(featureDefinitionLatl);
    expect(tokens).toStrictEqual(tokenizedFeature);
  });
  it('returns tokens from well-formed latl lexicon definition', () => {
    const tokens = tokenize(lexiconDefinitionLatl);
    expect(tokens).toStrictEqual(tokenizedLexicon);
  });
})
const epochDefinitionLatl = `
*PROTO
[+ FEATURE]>[- FEATURE]/._.
n>m/#_.
|CHILD
`
const tokenizedEpoch = [
  '*', 'PROTO',
  '[', '+', 'FEATURE', ']', '>', '[', '-', 'FEATURE', ']', '/', '.', '_', '.',
  'n', '>', 'm', '/', '#', '_', '.',
  '|', 'CHILD'
]
const featureDefinitionLatl = `
[+ PLOSIVE] = kp / p / b / d / t / g / k
[- PLOSIVE] = m / n / s / z
[SONORANT
+= m / n
-= s / z / kp / p / b / d / t / g / k
]
`
const tokenizedFeature = [
  '[', '+', 'PLOSIVE', ']', '=', 'kp', '/', 'p', '/', 'b', '/', 'd', '/', 't', '/', 'g', '/', 'k',
  '[', '-', 'PLOSIVE', ']', '=', 'm', '/', 'n', '/', 's', '/', 'z',
  '[', 'SONORANT',
  '+=', 'm', '/', 'n',
  '-=', 's', '/', 'z', '/', 'kp', '/', 'p', '/', 'b', '/', 'd', '/', 't', '/', 'g', '/', 'k',
  ']'
]
const lexiconDefinitionLatl = `
/PROTO
kpn
sm
/
`
const tokenizedLexicon = [
  '/', 'PROTO',
  'kpn',
  'sm',
  '/'
]
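
For completeness, a sketch of the error path (not covered by the tests above): an input containing a character that no tokenTypes entry matches makes getOneToken throw, which tokenize converts into an error object rather than propagating.

// hypothetical sketch, not part of this commit
tokenize('?');
// no pattern in tokenTypes matches '?', so getOneToken throws and
// tokenize returns:
// { errors: 'tokenization error', message: 'Unexpected token at ?' }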