diff options
| author | 2021-12-23 17:40:59 -0500 | |
|---|---|---|
| committer | 2021-12-23 17:40:59 -0500 | |
| commit | a56756adaf54b377ba81a5db9a5a727ef4a6c44b (patch) | |
| tree | 8bb7f2ea20e6d737149cf329ef5e8b5d7baf26ca | |
| parent | Initial commit (diff) | |
initial import
| -rw-r--r-- | README.md | 257 | ||||
| -rw-r--r-- | UNLICENSE (renamed from LICENSE) | 0 | ||||
| -rw-r--r-- | patok.lua | 39 | ||||
| -rw-r--r-- | piecemeal.lua | 106 |
4 files changed, 402 insertions, 0 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..b923887 --- /dev/null +++ b/README.md @@ -0,0 +1,257 @@ +# Patok +The Lua Pattern Tokenizer. + +Patok is an efficient tokenizer that lexes based on patterns you provide. +It does this while avoiding slow copy operations, making it reasonably efficient. +What's more is it's written in pure lua! + +## Usage + +### Constructing the lexer +Requiring patok gives back a constructor function. +You call it (often repeatedly) with tables, +where the key is the name of the pattern (which will come in handy later) +and the value is the pattern itself. +Once you're done defining lexemes, you call it again with no arguments. + +Consider the following basic example: +```lua +patok = require 'patok' +lexer = patok { + number = '%d+', + word = '%w+', +}() +``` + +Note that patterns will be tried *in order*. +However, due to how lua's tables work, the entries within a single table are unordered. +This is why calling the constructor multiple times can be useful. + +Consider the following example: +```lua +lexer = patok { + foo = 'foo', + word = '%w+', +}() +``` + +This lexer may interpret "foo" as either a foo token or a word token - it is ambiguous. +We can make it unambiguous by enforcing an order, as such: +```lua +lexer = patok { + foo = 'foo', +}{ + word = '%w+', +}() +``` + +Now "foo" will always be a "foo" lexeme, and never a word. + +### Using the lexer +Once your lexer is constructed, you have two methods of interest: reset and next. +Reset will set your lexer up to lex a particular string. +Next will get the next token of the currently set string. 
+ +Let's make a simple example, with comments demonstrating the outputs: +```lua +lexer = patok { + plus = '%+', + minus = '%-', + star = '%*', + slash = '/', + digit = '%d+', + ws = '%s+', +}() +lexer:reset '10 + 15' +lexer:next() -- {start=1, stop=2, type="digit", value="10"} +lexer:next() -- {start=3, stop=3, type="ws", value=" "} +lexer:next() -- {start=4, stop=4, type="plus", value="+"} +lexer:next() -- {start=5, stop=5, type="ws", value=" "} +lexer:next() -- {start=6, stop=7, type="digit", value="15"} +lexer:next() -- nil: feed more/different data +``` + +As per the above example, +a return value of nil means that whatever follows is not a token. +It may mean end of input, +or merely that whatever follows isn't tokenizable with the given ruleset. +Here's an example of the latter: + +```lua +lexer = patok { + a = 'a', + b = 'b', +}() +lexer:reset 'ac' +lexer:next() -- {start=1, stop=1, type='a', value='a'} +lexer:next() -- nil, even though we could still consume 'c' +``` 
+That means that you are provided with a set of parsing generating functions. +You compose and combine them into parsers, which you then compose and combine further. +The end result is a parser that parses your entire document on demand. + +### Built-Ins +Piecemeal provides the following built-in functions: +* lexeme: lexeme looks for a "type" of token produced by patok +* value: value looks for an exact match of a token's text +* all: takes a list of parsers, producing a parser for all of them in a row +* alt: takes a list of parsers, + producing a parser that looks for any one of its inputs (in order) +* opt: takes a parser and makes it optional +* plus: takes a parser and allows it to occur more than once in a row + (it's the `+` operator in regex/PEG) +* star: equivalent to optional plus (it's the `*` operator in regex/PEG) +* postp: takes a parser and a function, returns a parser + whose output transforms the output of the input parser using the provided function + +Finally, piecemeal provides the "parse" function, which takes the text to parse, +the patok (or api-compatible) lexer, and the parser to parse the text with. + +This may be confusing, +so let's look through a commentated example. + +### Example Grammar +This example grammar will be able to handle mathematical expressions. +For the sake of brevity, we'll only implement addition and multiplication. +Do know that you can extend this approach to cover all of math, however. + +First, let's define our lexer. +```lua +lexer = patok { + op = '[+*]', + num = '%d+', + ws = '%s+', +}() +``` + +We could have also made special lexemes for `+` and `*` individually. +However, this way, we can demonstrate both `pm.lexeme` and `pm.value`. + +Let's prepare some lexing parsers ahead of time. 
+```lua +lex = { + plus = pm.value '+', + star = pm.value '*', + digit = pm.postp(pm.lexeme 'num', function (d) return tonumber(d.value) end), + space = pm.opt(pm.lexeme 'ws'), +} +``` + +In that snippet, there are two things to note. +First, we made the whitespace parser optional, as our grammar does not have significant whitespace. +Secondly, we used the potentially confusing `postp` function on digit. + +Normally, a terminal parser (i.e. `lexeme` and `value`) will return the bare token, as given to it by patok. +However, we generally don't want a huge layered list of tokens as the output. +Postp allows us to perform postprocessing operations on whatever data the input parser gives out. + +In this case, we know the input data will be a patok token. +We're only really interested in the actual number, though. +So we return the numeric representation of the token. +We know it already looks like a number, because of our lexer pattern. +This means that other parsers that consume our digit parser will be able to simply work with digits. + +To make this easier, we'll write a convenience function. +```lua +function findnums (d, acc) + local out = acc or {} + for _, v in ipairs(d) do + if type(v) == 'number' then + table.insert(out, v) + elseif type(v) == 'table' then + findnums(v, out) + end + end + return out +end +``` + +A few things to note here. +First, note that we iterate over ipairs. +If we iterated over pairs, we would catch the start and end index of lexer tokens. +Secondly, note that we use the fact that tables are passed by reference in lua to allow for in-line accumulation. + +Now that that's done, we can define our primary parsers. +The grammar looks something like so: +``` +expr <- add +add <- mult ('+' mult)* +mult <- digit ('*' digit)* +``` + +This makes sure that multiplication happens before addition. +We can add subtraction and division in-line by using alternatives for the signs, and switching on them in the postprocessing. 
+ +Let's implement mult first. +```lua +mult = pm.postp( + pm.all(lex.digit, pm.star(pm.all(lex.space, lex.star, lex.space, lex.digit))), + function (d) + local acc = 1 + for _, v in ipairs(findnums(d)) do + acc = acc * v + end + return acc + end) +``` + +The parser component of the postprocessor is equivalent to the grammar above. +In the postprocessing function, we take advantage of the convenience function we wrote. +We simply multiply all of the bare digits (which we know are consumed as a part of this sub-expression) together! +Importantly, we just return a number again, since that's what we're really interested in. + +We can write add using the same method. +```lua +add = pm.postp( + pm.all(mult, pm.star(pm.all(lex.space, lex.plus, lex.space, mult))), + function (d) + local acc = 0 + for _, v in ipairs(findnums(d)) do + acc = acc + v + end + return acc + end) +``` + +Note that we can use mult here directly - it's a valid parser like any other. + +Finally, we can define expr, though it's technically optional. +```lua +expr = add +``` + +And now we can use the parser! +```lua +eind, out = pm.parse("10 + 5 * 2 + 10", lexer, expr) -- 14, 30 +``` + +### Missing +In the above sample, we did not end up using the `alt` or `plus` generators. +`alt` is related to `all`. +Where `all` requires all of its arguments to succeed in order, `alt` will try them all in order, but only one has to succeed. +`plus` is related to `star`. +With `star`, zero matches are accepted. +`plus` works the same way, except at least one match is required. 
diff --git a/patok.lua b/patok.lua new file mode 100644 index 0000000..f192733 --- /dev/null +++ b/patok.lua @@ -0,0 +1,39 @@ +local meta = {} +function meta:__call(p) + if p == nil then return self.out end + for k, v in pairs(p) do + table.insert(self.out.tokens, {name=k, pattern=v}) + end + return self +end + +function new (p) + local out = { + tokens = {}, + reset = function (self, input) + self.index = 1 + self.input = input + end, + next = function (self) + for _, v in ipairs(self.tokens) do + local out = {string.find(self.input, v.pattern, self.index)} + if out[1] == self.index and out[2] >= out[1] then + self.index = out[2]+1 + return { + type = v.name, + start = out[1], + stop = out[2], + value = string.sub(self.input, out[1], out[2]) + } + end + end + -- we didn't find anything + return nil -- feed more data! or better data. either one + end, + } + local constructor = {out=out} + setmetatable(constructor, meta) + return constructor(p) +end + +return new diff --git a/piecemeal.lua b/piecemeal.lua new file mode 100644 index 0000000..0f6dd6c --- /dev/null +++ b/piecemeal.lua @@ -0,0 +1,106 @@ +--[[ + A parser has the signature: (list, sindex) -> eindex [, output] + If the parser "fails", it must return exactly nil. + If the parser "succeeds", it must return at least the eindex, and may also return some output. +]]-- + +return { + lexeme = function (type) + return function (l, i) + if l[i] and l[i].type == type then + return i+1, l[i] + end + return nil + end + end, + value = function (value) + return function (l, i) + if l[i] and l[i].value == value then + return i+1, l[i] + end + return nil + end + end, + all = function (...) + local args = {...} + return function (l, i) + local res = {} + local eind, out + for _, v in ipairs(args) do + eind, out = v(l, i) + if not eind then return nil end + i = eind + table.insert(res, out) + end + return i, res + end + end, + alt = function (...) 
+ local args = {...} + return function (l, i) + local eind, out + for _, v in ipairs(args) do + eind, out = v(l, i) + if eind then + return eind, out + end + end + return nil + end + end, + opt = function (p) + return function (l, i) + local eind, out = p(l, i) + if eind + then return eind, out + else return i + end + end + end, + plus = function (p) + return function (l, i) + local res = {} + local eind, out = p(l, i) + if not eind then return nil end + repeat + i = eind + table.insert(res, out) + eind, out = p(l, i) + until not eind + return i, res + end + end, + star = function (p) + return function (l, i) + local res = {} + local eind, out + repeat + eind, out = p(l, i) + if eind then + table.insert(res, out) + i = eind + end + until not eind + return i, res + end + end, + + postp = function (p, f) + return function (l, i) + local eind, out = p(l, i) + if out then out = f(out) end + return eind, out + end + end, + + parse = function (text, lexer, parser) + local tokens = {} + lexer:reset(text) + local next + repeat + next = lexer:next() + if next then table.insert(tokens, next) end + until next == nil + return parser(tokens, 1) + end, +} |
