4 files changed, 402 insertions, 0 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b923887
--- /dev/null
+++ b/README.md
@@ -0,0 +1,257 @@
+# Patok
+The Lua Pattern Tokenizer.
+
+Patok is an efficient tokenizer that lexes based on patterns you provide.
+It does this while avoiding slow copy operations, making it reasonably efficient.
+What's more is it's written in pure lua!
+
+## Usage
+
+### Constructing the lexer
+Requiring patok gives back a constructor function.
+You call it (often repeatedly) with lists,
+where the key is the name of the pattern (which will come in handy later)
+and the value is the pattern itself.
+Once you're done defining lexemes, you call it again with no arguments.
+
+Consider the following basic example:
+```lua
+patok = require 'patok'
+lexer = patok {
+	number = '%d+',
+	word   = '%w+',
+}()
+```
+
+Note that patterns will be tried *in order*.
+However, due to how lua's lists work, lists are unordered.
+This is why calling the constructor multiple times can be useful.
+
+Consider the following example:
+```lua
+lexer = patok {
+	foo = 'foo',
+	word = '%w+',
+}()
+```
+
+This lexer may interpret "foo" as either a foo token or a word token - it is ambiguous.
+We can make it unambiguous by enforcing an order, as such:
+```lua
+lexer = patok {
+	foo = 'foo',
+}{
+	word = '%w+',
+}()
+```
+
+Now "foo" will always be a "foo" lexeme, and never a word.
+
+### Using the lexer
+Once your lexer is constructed, you have two methods of interest: reset and next.
+Reset will set your lexer up to lex a particular string.
+Next will get the next token of the currently set string.
+
+Let's make a simple example, with comments demonstrating the outputs:
+```lua
+lexer = patok {
+	plus  = '%+',
+	minus = '%-',
+	star  = '%*',
+	slash = '/',
+	digit = '%d+',
+	ws    = '%s+',
+}()
+lexer:reset '10 + 15'
+lexer:next() -- {start=1, stop=2, type="digit", value="10"}
+lexer:next() -- {start=3, stop=3, type="ws", value=" "}
+lexer:next() -- {start=4, stop=4, type="plus", value="+"}
+lexer:next() -- {start=5, stop=5, type="ws", value=" "}
+lexer:next() -- {start=6, stop=7, type="digit", value="15"}
+lexer:next() -- nil: feed more/different data
+```
+
+As per the above example,
+a return value of nil means that whatever follows is not a token.
+It may mean end of input,
+or merely that whatever follows isn't tokenizable with the given ruleset.
+Here's an example of the latter:
+
+```lua
+lexer = patok {
+	a = 'a',
+	b = 'b',
+}()
+lexer.reset 'ac'
+lexer:next() -- {start=1, stop=1, type='a', value='a'}
+lexer:next() -- nil, even though we could still consume 'c'
+```
+
+### Parsing
+If you just wanted a standalone lexer/tokenizer, that's all you need to know!
+Most people, however, need a parser to go along with their lexer to make it useful.
+Along with patok comes piecemeal:
+a naive parser combinator made to work with patok.
+
+Note that unlike patok, piecemeal is not particularly efficient,
+nor capable of streaming input.
+If you have a better patok-compatible option to use, please use that instead!
+If you make such a parser, feel free to contact me at <toast+git@toast.cafe>,
+I will add it to this section.
+
+That said,
+piecemeal is more than sufficient for many use-cases where lua itself is sufficient.
+Please read the next section to find out how to use it.
+(There will be no further information on patok itself for the rest of this file.)
+
+## Piecemeal
+Piecemeal is the default parser for patok.
+If you have access to a different parser, chances are it will work better.
+
+Piecemeal is a recursive descent parser combinator.
+That means that you are provided with a set of parsing generating functions.
+You compose and combine them into parsers, which you then compose and combine further.
+The end result is a parser that parses your entire document on demand.
+
+### Built-Ins
+Piecemeal provides the following built-in functions:
+* lexeme: lexeme looks for a "type" of token produced by patok
+* value: value looks for an exact match of a token's text
+* all: takes a list of parsers, producing a parser for all of them in a row
+* alt: takes a list of parsers,
+  producing a parser that looks for any one of its inputs (in order)
+* opt: takes a parser and makes it optional
+* plus: takes a parser and allows it to occur more than once in a row
+  (it's the `+` operator in regex/PEG)
+* star: equivalent to optional star (it's the `*` operator in regex/PEG)
+* postp: takes a parser and a function, returns a parser
+  whose output transforms the output of the input parser using the provided function
+
+Finally, piecemeal provides the "parse" function, which takes the text to parse,
+the patok (or api-compatible) lexer, and the parser to parse the text with.
+
+This may be confusing,
+so let's look through a commentated example.
+
+### Example Grammar
+This example grammar will be able to handle mathematic expressions.
+For the sake of brevity, we'll only implement addition and multiplication.
+Do know that you can extend this approach to cover all of math, however.
+
+First, let's define our lexer.
+```lua
+lexer = patok {
+	op  = '[+*]',
+	num = '%d+',
+	ws  = '%s+',
+}
+```
+
+We could have also made special lexemes for `+` and `*` individually.
+However, this way, we can demonstrate both `pm.lexeme` and `pm.value`.
+
+Let's prepare some lexing parsers ahead of time.
+```lua
+lex = {
+	plus  = pm.value '+',
+	star  = pm.value '*',
+	digit = pm.postp(pm.lexeme 'digit', function (d) return tonumber(d.value) end),
+	space = pm.opt(pm.lexeme 'space'),
+}
+```
+
+In that snippet, there are two things to note.
+First, we made the whitespace parser optional, as our grammar does not have significant whitespace.
+Secondly, we used the potentially confusing `postp` function on digit.
+
+Normally, a terminal parser (i.e `lexeme` and `value`) will return the bare token, as given to it by patok.
+However, we generally don't want a huge layered list of tokens as the output.
+Postp allows us to perform postprocessing operations on whatever data the input parser gives out.
+
+In this case, we know the input data will be a patok token.
+We're only really interested in the actual number, though.
+So we return the numeric representation of the token.
+We know it already looks like a number, because of our lexer pattern.
+This means that other parsers that consume our digit parser will be able to simply work with digits.
+
+To make this easier, we'll write a convenience function.
+```lua
+function findnums (d, acc)
+	local out = acc or {}
+	for _, v in ipairs(d) do
+		if type(v) == 'number' then
+			table.insert(out, v)
+		elseif type(v) == 'table' then
+			findnums(v, out)
+		end
+	end
+	return out
+end
+```
+
+A few things to note here.
+First, note that we iterate over ipairs.
+If we iterated over pairs, we would catch the start and end index of lexer tokens.
+Secondly, note that we use the fact that tables are passed by reference in lua to allow for in-line accumulation.
+
+Now that that's done, we can define our primary parsers.
+The grammar looks something like so:
+```
+expr <- add
+add <- mult ('+' mult)*
+mult <- digit ('*' digit)*
+```
+
+This makes sure that multiplication happens before addition.
+We can add subtraction and multiplication in-line by using alternatives for the signs, and switching on them in the postprocessing.
+
+Let's implement mult first.
+```lua
+mult = pm.postp(
+	pm.all(lex.digit, pm.star(pm.all(lex.space, lex.times, lex.space, lex.digit))),
+	function (d)
+		local acc = 1
+		for _, v in ipairs(findnums(d)) do
+			acc = acc * v
+		end
+		return acc
+	end)
+```
+
+The parser component of the postprocessor is equivalent to the grammar above.
+In the postprocessing function, we take advantage of the conveninece function we wrote.
+We simply multiply all of the bare digits (which we know are consumed as a part of this sub-expression) together!
+Importantly, we just return a number again, since that's what we're really interested in.
+
+We can write add using the same method.
+```lua
+add = pm.postp(
+	pm.all(mult, pm.star(pm.all(lex.space, lex.plus, lex.space, mult))),
+	function (d)
+		local acc = 0
+		for _, v in ipairs(findnums(d)) do
+			acc = acc + v
+		end
+		return acc
+	end)
+```
+
+Note that we can use mult here directly - it's a valid parser like any other.
+
+Finally, we can define expr, though it's technically optional.
+```lua
+expr = add
+```
+
+And now we can use the parser!
+```lua
+eind, out = pm.parse("10 + 5 * 2 + 10", lexer, expr) -- 14, 30
+```
+
+### Missing
+In the above sample, we did not end up using the `alt` or `plus` generators.
+`alt` is related to `all`.
+Where `all` requires all of its arguments to succeed in order, `alt` will try them all in order, but only one has to succeed.
+`plus` is related to `star`.
+With `star`, zero matches are accepted.
+`plus` works the same way, except at least one match is required.
diff --git a/LICENSE b/UNLICENSE
index fdddb29..fdddb29 100644
--- a/LICENSE
+++ b/UNLICENSE
diff --git a/patok.lua b/patok.lua
new file mode 100644
index 0000000..f192733
--- /dev/null
+++ b/patok.lua
@@ -0,0 +1,39 @@
+local meta = {}
+function meta:__call(p)
+	if p == nil then return self.out end
+	for k, v in pairs(p) do
+		table.insert(self.out.tokens, {name=k, pattern=v})
+	end
+	return self
+end
+
+function new (p)
+	local out = {
+		tokens = {},
+		reset  = function (self, input)
+			self.index = 1
+			self.input = input
+		end,
+		next   = function (self)
+			for _, v in ipairs(self.tokens) do
+				local out = {string.find(self.input, v.pattern, self.index)}
+				if out[1] == self.index and out[2] >= out[1] then
+					self.index = out[2]+1
+					return {
+						type  = v.name,
+						start = out[1],
+						stop  = out[2],
+						value = string.sub(self.input, out[1], out[2])
+					}
+				end
+			end
+			-- we didn't find anything
+			return nil -- feed more data! or better data. either one
+		end,
+	}
+	local constructor = {out=out}
+	setmetatable(constructor, meta)
+	return constructor(p)
+end
+
+return new
diff --git a/piecemeal.lua b/piecemeal.lua
new file mode 100644
index 0000000..0f6dd6c
--- /dev/null
+++ b/piecemeal.lua
@@ -0,0 +1,106 @@
+--[[
+	A parser has the signature: (list, sindex) -> eindex [, output]
+	If the parser "fails", it must return exactly nil.
+	If the parser "succeeds", it must return at least the eindex, and may also return some output.
+]]--
+
+return {
+	lexeme = function (type)
+		return function (l, i)
+			if l[i] and l[i].type == type then
+				return i+1, l[i]
+			end
+			return nil
+		end
+	end,
+	value = function (value)
+		return function (l, i)
+			if l[i] and l[i].value == value then
+				return i+1, l[i]
+			end
+			return nil
+		end
+	end,
+	all = function (...)
+		local args = {...}
+		return function (l, i)
+			local res = {}
+			local eind, out
+			for _, v in ipairs(args) do
+				eind, out = v(l, i)
+				if not eind then return nil end
+				i = eind
+				table.insert(res, out)
+			end
+			return i, res
+		end
+	end,
+	alt = function (...)
+		local args = {...}
+		return function (l, i)
+			local eind, out
+			for _, v in ipairs(args) do
+				eind, out = v(l, i)
+				if eind then
+					return eind, out
+				end
+			end
+			return nil
+		end
+	end,
+	opt = function (p)
+		return function (l, i)
+			local eind, out = p(l, i)
+			if eind
+				then return eind, out
+				else return i
+			end
+		end
+	end,
+	plus = function (p)
+		return function (l, i)
+			local res = {}
+			local eind, out = p(l, i)
+			if not eind then return nil end
+			repeat
+				i = eind
+				table.insert(res, out)
+				eind, out = p(l, i)
+			until not eind
+			return i, res
+		end
+	end,
+	star = function (p)
+		return function (l, i)
+			local res = {}
+			local eind, out
+			repeat
+				eind, out = p(l, i)
+				if eind then
+					table.insert(res, out)
+					i = eind
+				end
+			until not eind
+			return i, res
+		end
+	end,
+
+	postp = function (p, f)
+		return function (l, i)
+			local eind, out = p(l, i)
+			if out then out = f(out) end
+			return eind, out
+		end
+	end,
+
+	parse = function (text, lexer, parser)
+		local tokens = {}
+		lexer:reset(text)
+		local next
+		repeat
+			next = lexer:next()
+			if next then table.insert(tokens, next) end
+		until next == nil
+		return parser(tokens, 1)
+	end,
+}