Zig's current Parser
Look it up yourself :)
Andrew Kelley - Practical DOD
https://vimeo.com/649009599
export fn columnCounts(chunk: @Vector(16, u8)) @Vector(16, u8) {
const newlines: u16 = @bitCast(@as(@Vector(16, u8), chunk) == @as(@Vector(16, u8), @splat('\n')));
const ones = 0x1111111111111111;
const ascending_indices = 0xFEDCBA9876543210;
const restart_nibbles_mask = pdep(newlines, ones ^ 1) *% 0xF;
const restart_nibbles_indices = pext(ascending_indices, restart_nibbles_mask);
const prefix_diff = restart_nibbles_indices -% (restart_nibbles_indices << 4);
const vec: @Vector(8, u8) = @bitCast(ascending_indices -% pdep(prefix_diff, restart_nibbles_mask) *% ones);
return std.simd.interlace(.{ vec, vec >> @splat(4) }) & @as(@Vector(16, u8), @splat(0xF));
}
export
fn
columnCounts
(
chunk
:
@Vector
{
.kind = .@"export",
.start = 0,
.end = 6,
}
{
.kind = .@"fn",
.start = 7,
.end = 9,
}
{
.kind = .@"identifier",
.start = 10,
.end = 22,
}
{
.kind = .@"(",
.start = 22,
.end = 23,
}
{
.kind = .@"identifier",
.start = 23,
.end = 28,
}
{
.kind = .@":",
.start = 28,
.end = 29,
}
{
.kind = .@"builtin",
.start = 30,
.end = 37,
}
pub fn tokenize(allocator: Allocator, source: []const u8) !std.ArrayListUnmanaged(Token) {
var cur = source[0..];
var tokens = try std.ArrayListUnmanaged(Token).initCapacity(allocator, source.len / 8);
while (true) {
switch (cur[0]) {
'a'...'z', 'A'...'Z', '_' => {
const start = cur;
while (true) {
cur = cur[1..];
switch (cur[0]) {
'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
else => break,
}
}
// our token span is start..end
const end = cur;
(try tokens.addOne(allocator)).* = Token{
.kind = if (getKeywordKind(start, end)) |kw| kw else .identifier,
.start = start,
.end = end,
};
},
'0'...'9' => {
// parse a number
},
'~', ':', ';', '[', ']', '?', '(', ')', '{', '}', ',' => |c| {
const start = cur;
cur = cur[1..];
const end = cur;
(try tokens.addOne(allocator)).* = Token{
.kind = @bitCast(c),
.start = start,
.end = end,
};
}
'"' => while (true) { // parse a string
cur = cur[1..];
const is_escaped = cur[0] == '\\';
cur = cur[@intFromBool(is_escaped)..];
if (cur[0] == '"' and !is_escaped) break;
},
' ', '\t', '\n', '\r' => cur = cur[1..],
0 => return tokens,
}
}
}
source:
export fn columnCounts(chunk: @Vector(16, u8)) @Vector(16, u8) {
pub fn tokenize(allocator: Allocator, source: []const u8) !std.ArrayListUnmanaged(Token) {
var cur = source[0..];
var tokens = try std.ArrayListUnmanaged(Token).initCapacity(allocator, source.len / 8);
while (true) {
switch (cur[0]) {
'a'...'z', 'A'...'Z', '_' => {
const start = cur;
while (true) {
const V = @Vector(@bitSizeOf(usize), u8);
const vec: V = cur[0..@sizeOf(V)].*;
const identifier_bitstring =
(@as(usize, @bitCast(vec == @as(V, @splat('_'))))) |
(@as(usize, @bitCast(@as(V, @splat('a')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('z'))))) |
(@as(usize, @bitCast(@as(V, @splat('A')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('Z'))))) |
(@as(usize, @bitCast(@as(V, @splat('0')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('9')))));
cur = cur[@ctz(~identifier_bitstring)..];
if (~identifier_bitstring != 0) break;
}
// our token span is start..end
const end = cur;
(try tokens.addOne(allocator)).* = Token{
.kind = if (getKeywordKind(start, end)) |kw| kw else .identifier,
.start = start,
.end = end,
};
},
'0'...'9' => {
// parse a number
},
'~', ':', ';', '[', ']', '?', '(', ')', '{', '}', ',' => |c| {
const start = cur;
cur = cur[1..];
const end = cur;
(try tokens.addOne(allocator)).* = Token{
.kind = @bitCast(c),
.start = start,
.end = end,
};
}
'"' => while (true) { // parse a string
cur = cur[1..];
const is_escaped = cur[0] == '\\';
cur = cur[@intFromBool(is_escaped)..];
if (cur[0] == '"' and !is_escaped) break;
},
' ', '\t', '\n', '\r' => cur = cur[1..],
0 => return tokens,
}
}
}
source:
export fn columnCounts(chunk: @Vector(16, u8)) @Vector(16, u8) {
export fn maskForAlphanumericAndUnderscoresBetterSWAR(src: [*]align(8) const u8) u64 {
const ones: u64 = 0x0101010101010101;
const mask = comptime ones * 0x7F;
var result: u64 = 0;
inline for (0..8) |i| {
const v: u64 = @bitCast(src[i*8 ..][0..8].*);
const low_7_bits = v & mask;
const non_underscore = (low_7_bits ^ comptime ones * '_') + mask;
// alpha check:
// Upper 3 bits must be 4 or 6. Lower 5 bits must be in 1...26
const magic_mask = ((v & comptime ones * ~@as(u7, 0x20)) ^ comptime ones * 0x40);
const non_0x40_or_0x60 = magic_mask + mask;
const non_alpha = magic_mask + comptime ones * (0x80 - 27);
// digit check:
// Upper nibble must be 3. Lower nibble must be in 0...9
const flipped_0x30 = low_7_bits ^ comptime ones * 0x30;
const non_digit = flipped_0x30 + comptime ones * (0x80 - 0xA);
const final = 0x8080808080808080 &
(~v &
((non_digit & non_alpha & non_underscore) ^ non_0x40_or_0x60)
);
comptime var mult: u64 = 0;
comptime for (0..8) |j| { mult |= @as(u64, 1) << (7 * j); };
const movmask = (final *% mult) >> 56;
result |= movmask << (8 * i);
}
return result;
}
&
^
+
*
+
fn maskForUnderscoresBetterArm(src: [*]const u8) u64 {
const indices = std.simd.iota(u6, 16) * @splat(4);
var vec0 = @shuffle(u8, src[0..64].*, undefined, indices + @splat(0));
var vec1 = @shuffle(u8, src[0..64].*, undefined, indices + @splat(1));
var vec2 = @shuffle(u8, src[0..64].*, undefined, indices + @splat(2));
var vec3 = @shuffle(u8, src[0..64].*, undefined, indices + @splat(3));
const false_vec = std.mem.zeroes(@Vector(16, u8));
const true_vec = ~false_vec;
vec0 = @select(u8, vec0 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
vec1 = @select(u8, vec1 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
vec2 = @select(u8, vec2 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
vec3 = @select(u8, vec3 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
const t0 = (vec1 & @splat(0b10000000) | (vec0 >> @splat(1));
const t1 = (vec3 & @splat(0b10000000) | (vec2 >> @splat(1));
const t2 = ( t1 & @splat(0b11000000) | (t0 >> @splat(2));
const t3 = ( t2 & @splat(0b11110000) | (t2 >> @splat(4));
var t4: @Vector(8, u16) = @bitCast(t3);
t4 >>= @splat(4);
const t5: @Vector(16, u8) = @bitCast(t4);
return @bitCast(@shuffle(u8, t5, undefined, [_]i32{ 0, 2, 4, 6, 8, 10, 12, 14 }));
}
src
{0,4, 8,12,16,20,24,28,32,36,40,44,48,52,56,60}
{0,4, 8,12,16,20,24,28,32,36,40,44,48,52,56,60}
{1,5, 9,13,17,21,25,29,33,37,41,45,49,53,57,61}
{2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62}
{3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63}
pub fn tokenize(gpa: Allocator, source: [:0]align(@bitSizeOf(uword)) const u8) ![]Token {
const end_ptr = &source.ptr[source.len];
const extended_source_len = std.mem.alignForward(usize, source.len + EXTENDED_BACK_SENTINELS_LEN, @bitSizeOf(uword));
const extended_source = source.ptr[0..extended_source_len];
const tokens = try gpa.alloc(Token, extended_source_len);
errdefer gpa.free(tokens);
var cur_token = tokens;
var prev: []const u8 = extended_source;
var cur = prev[0..];
var bitstrings: [3]uword = undefined;
var op_type: Tag = .whitespace;
var selected_bitmap: *uword = bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
var bitmap_index = @intFromPtr(source.ptr);
var utf8_checker: Utf8Checker = .{};
bitmap_index -%= @bitSizeOf(uword);
while (true) : (cur = cur[1..]) {
var aligned_ptr = @intFromPtr(cur.ptr) / @bitSizeOf(uword) * @bitSizeOf(uword);
while (true) {
while (bitmap_index != aligned_ptr) {
bitmap_index +%= @bitSizeOf(uword);
const V = @Vector(@bitSizeOf(uword), u8);
const vec: V = @as([*]align(@bitSizeOf(uword)) const u8, @ptrFromInt(bitmap_index))[0..@bitSizeOf(uword)].*;
try utf8_checker.validateChunk(vec);
bitstrings[0] = @as(usize, @bitCast(vec != @as(V, @splat('\x7F')))) &
(@as(usize, @bitCast(vec >= @as(V, @splat(' ')))) | @as(usize, @bitCast(vec == @as(V, @splat('\t')))));
bitstrings[1] uword =
@as(usize, @bitCast(vec == @as(V, @splat('\t')))) |
@as(usize, @bitCast(vec == @as(V, @splat('\n')))) |
@as(usize, @bitCast(vec == @as(V, @splat(' '))));
bitstrings[2]: uword = (@as(usize, @bitCast(vec == @as(V, @splat('_'))))) |
(@as(usize, @bitCast(@as(V, @splat('a')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('z'))))) |
(@as(usize, @bitCast(@as(V, @splat('A')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('Z'))))) |
(@as(usize, @bitCast(@as(V, @splat('0')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('9')))));
}
const cur_misalignment: std.math.Log2Int(uword) = @truncate(@intFromPtr(cur.ptr));
const bitstring = selected_bitmap.* >> cur_misalignment;
cur = cur[@ctz(~bitstring)..];
// If we made it to the end of chunk(s), wrap around, grab more bits, and start again
aligned_ptr = @intFromPtr(cur.ptr) / @bitSizeOf(uword) * @bitSizeOf(uword);
if (bitmap_index == aligned_ptr) break;
}
if (op_type == .eof) break;
while (true) {
var len: u32 = @intCast(@intFromPtr(cur.ptr) - @intFromPtr(prev.ptr));
switch (prev[0]) {
'a'...'z' => if (Keywords.lookup(prev.ptr, len)) |op_kind| {
op_type = op_kind;
const is_space = @intFromBool(cur[0] == ' ');
cur = cur[is_space..];
len += is_space;
},
else => {},
}
cur_token[0] = .{ .len = len, .kind = op_type };
cur_token = cur_token[1..];
prev = cur;
if (cur[0] == '@') {
cur = cur[1..];
op_type = switch (cur[0]) {
'a'...'z', 'A'...'Z' => .builtin,
'"' => .string_identifier,
else => return error.MissingQuoteOrLetterAfterAtSymbol,
};
selected_bitmap = bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
} else if (Operators.isSingleCharOp(cur[0])) {
selected_bitmap = bitstrings[@as(u2, @truncate(@intFromEnum(Tag.whitespace)))];
op_type = Operators.hashOp(Operators.getOpWord(cur.ptr, 1));
} else {
op_type = switch (cur[0]) {
'a'...'z', 'A'...'Z', '_' => .identifier,
'0'...'9' => .number,
'\'' => .char_literal,
'"' => .string,
' ', '\t', '\n' => .whitespace,
0 => .eof,
else => .unknown,
};
selected_bitmap = bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
}
if (cur[0] != '\'' and cur[0] != '"') break;
while (true) {
cur = cur[1..];
const is_escaped = cur[0] == '\\';
cur = cur[@intFromBool(is_escaped)..];
if ((cur[0] == chr and !is_escaped) or cur[0] < ' ') break;
}
}
}
if (@intFromPtr(cur.ptr) < @intFromPtr(end_ptr)) return error.Found0ByteInFile;
cur_token[0] = .{ .len = 1, .kind = .eof };
return tokens;
}
fn ctz(a: u64) u8 {
if (a == 0) return 64;
const x = a & (~a +% 1); // a & -a
const deBruijn = 0b0000001000011000101000111001001011001101001111010101110110111111;
comptime var lookup_table: [64]u8 = undefined;
comptime for (0..64) |i| { lookup_table[(deBruijn << i) >> 58] = i; };
return lookup_table[(deBruijn *% x) >> 58];
}
fn cto(a: u64) u8 {
if (a == -1) return 64;
const not_a = ~a;
const x = not_a & (~not_a +% 1); // ~a & -~a
const deBruijn = 0b0000001000011000101000111001001011001101001111010101110110111111;
comptime var lookup_table: [64]u8 = undefined;
comptime for (0..64) |i| { lookup_table[(deBruijn << i) >> 58] = i; };
return lookup_table[(deBruijn *% x) >> 58];
}
define dso_local noundef i64 @_Z3ct1m(i64 noundef %0) local_unnamed_addr #0 {
%2 = xor i64 %0, -1
%3 = tail call noundef i64 @llvm.cttz.i64(i64 %2, i1 false), !range !9
ret i64 %3
}
pub fn lookup(kw: [*]const u8, len: u32) ?Tag {
assert(len != 0);
const raw_hash = hashKw(kw, len);
const hash = mapToIndex(raw_hash);
comptime assert(max_kw_len < '0' and '0' > 16 - 1);
const KW_VEC = @Vector(16, u8);
const vec1: KW_VEC = sorted_padded_kws[hash];
const vec2: KW_VEC = kw[0..16].*; // the safety of this operation is validated at the top of this function
const len_vec: KW_VEC = @splat(@as(u8, @truncate(len)));
const cd = @select(u8, len_vec > std.simd.iota(u8, 16), vec2, len_vec);
if (std.simd.countTrues(cd != vec1) == 0) {
return @enumFromInt(~hash);
} else {
return null;
}
}
pub fn hashKw(kw: [*]const u8, len: u32) u7 {
const a = std.mem.readInt(u16, kw[0..2], .little);
const b = std.mem.readInt(u16, (kw - 2 + @as(usize, @intCast(len)))[0..2], .little);
return @truncate(((a ^ (len << 14)) *% b) >> 8);
}
fn mapToIndex(raw_hash: u7) u8 {
const mask = (@as(u64, 1) << @truncate(raw_hash)) -% 1;
return (if (raw_hash >= 64) @popCount(masks[0]) else 0) + @popCount(mask & masks[raw_hash / 64]);
}
const masks = [2]u64{
0b0000001000000110001011010011101000001000010010001011011000100100,
0b0001011010100000000001100000111101100010110011111001100110011100,
};
const sorted_padded_kws = [50][16]u8{
("extern" ++ "\x06" ** 10).*,
("while" ++ "\x05" ** 11).*,
("align" ++ "\x05" ** 11).*,
("else" ++ "\x04" ** 12).*,
("or" ++ "\x02" ** 14).*,
("break" ++ "\x05" ** 11).*,
("anytype" ++ "\x07" ** 9).*,
("volatile" ++ "\x08" ** 8).*,
("async" ++ "\x05" ** 11).*,
("errdefer" ++ "\x08" ** 8).*,
("catch" ++ "\x05" ** 11).*,
("test" ++ "\x04" ** 12).*,
("struct" ++ "\x06" ** 10).*,
("const" ++ "\x05" ** 11).*,
("return" ++ "\x06" ** 10).*,
("await" ++ "\x05" ** 11).*,
("resume" ++ "\x06" ** 10).*,
("opaque" ++ "\x06" ** 10).*,
("packed" ++ "\x06" ** 10).*,
("orelse" ++ "\x06" ** 10).*,
("var" ++ "\x03" ** 13).*,
("unreachable" ++ "\x0b" ** 5).*,
("threadlocal" ++ "\x0b" ** 5).*,
("anyframe" ++ "\x08" ** 8).*,
("noinline" ++ "\x08" ** 8).*,
("defer" ++ "\x05" ** 11).*,
("try" ++ "\x03" ** 13).*,
("pub" ++ "\x03" ** 13).*,
("usingnamespace" ++ "\x0e" ** 2).*,
("fn" ++ "\x02" ** 14).*,
("and" ++ "\x03" ** 13).*,
("allowzero" ++ "\x09" ** 7).*,
("error" ++ "\x05" ** 11).*,
("addrspace" ++ "\x09" ** 7).*,
("if" ++ "\x02" ** 14).*,
("nosuspend" ++ "\x09" ** 7).*,
("linksection" ++ "\x0b" ** 5).*,
("inline" ++ "\x06" ** 10).*,
("export" ++ "\x06" ** 10).*,
("asm" ++ "\x03" ** 13).*,
("noalias" ++ "\x07" ** 9).*,
("suspend" ++ "\x07" ** 9).*,
("switch" ++ "\x06" ** 10).*,
("union" ++ "\x05" ** 11).*,
("enum" ++ "\x04" ** 12).*,
("continue" ++ "\x08" ** 8).*,
("for" ++ "\x03" ** 13).*,
("callconv" ++ "\x08" ** 8).*,
("comptime" ++ "\x08" ** 8).*,
.{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
};
kw = "export", len = 6
hash = 38
raw_hash = 96
raw_hash = 96 & 63 => 32
const mask = (1 << 32) -% 1
add that 17 to @popCount(masks[0]), known at compile-time to be 21
fn advanceSourceCursor(astgen: *AstGen, end: usize) void {
const source = astgen.tree.source;
var i = astgen.source_offset;
var line = astgen.source_line;
var column = astgen.source_column;
assert(i <= end);
while (i < end) : (i += 1) {
if (source[i] == '\n') {
line += 1;
column = 0;
} else {
column += 1;
}
}
astgen.source_offset = i;
astgen.source_line = line;
astgen.source_column = column;
}
Daniel Lemire - Parsing JSON Really Quickly: Lessons Learned
https://www.youtube.com/watch?v=wlvKAT7SZIQ
Andrew Kelley - Practical DOD
https://vimeo.com/649009599
Let’s Build A Simple Interpreter. Part 7: Abstract Syntax Trees
https://ruslanspivak.com/lsbasi-part7/
fn parse(node: ?*Node) {
if (node == null) return;
// Operate on the Nodes in pre-order
parse(node.left);
// Operate on the Nodes in in-order
parse(node.right);
// Operate on the Nodes in post-order
}
Look it up yourself :)
Simon Gog - Blog
simongog.github.io
struct {
operators: [BUF_SIZE]Token,
merge_bitstring: u64,
next: u32,
operators_len: u8,
bitstr_len: u8,
}
// Example:
// .operators = .{ +, *, -, /, +, undefined, undefined, undefined }
// .merge_bitstring = 1010101010;
struct {
operators: [BUF_SIZE]Token,
merge_bitstring: u64,
next: u32,
operators_len: u8,
bitstr_len: u8,
}
// Example:
// .operators = .{ +, *, -, /, +, undefined, undefined, undefined }
// .merge_bitstring = 1010101010;
// .operands = .{ a, b, c, d, e, f, g, h, i ... }
// Output: a + b * c - d / e + f ...
fn produceShuffleVectorForByte(x: u8) @Vector(16, u8) {
const splatted = @as(u64, x) * 0x0101010101010101;
const bit_positions: u64 = @bitCast(@as(@Vector(8, u8), @splat(1)) << std.simd.iota(u3, 8));
const splatted_msbs = (splatted & bit_positions) + (0x8080808080808080 - bit_positions);
const splatted_lsbs = (splatted_msbs >> 7) & 0x0101010101010101;
const prefix_sums1 = ((splatted_lsbs << 4) | splatted_lsbs) *% 0x1111111111111110;
const splatted_lsbs_inv = splatted_lsbs ^ 0x0101010101010101;
const prefix_sums2 = ((splatted_lsbs_inv << 4) | splatted_lsbs_inv) *% 0x1111111111111110;
const selector = splatted_lsbs * 255;
const interleaved_shuffle_vector = (selector & (prefix_sums1 ^ prefix_sums2)) ^ prefix_sums2;
return @as(@Vector(16, u4), @bitCast(interleaved_shuffle_vector));
}
[0]00000000000000000000000000000000000000000000000000000000x = 01010111
00000001 00000001 00000001 00000001 00000001 00000001 00000001 00000001
*
+
1....... .1...... ..1..... ...1.... ....1... .....1.. ......1. .......1
&
0....... 01...... 011..... 0111.... 01111... 011111.. 0111111. 01111111
+
|
*
+
splatted_lsbs =
selector =
prefix_sums1 =
prefix_sums2 =
( 1 & (prefix_sums1 ^ prefix_sums2)) ^ prefix_sums2
( 0 & (prefix_sums1 ^ prefix_sums2)) ^ prefix_sums2
≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈
00000001 00000001 00000001 00000001 00000001 00000001 00000001 00000001
^
|
*
+
*
+
fn produceShuffleVectorForByte(x: u8) @Vector(16, u8) {
const splatted = pdep(x, 0x0101010101010101);
const vec = splatted * 255;
const nibble_indices: u64 = @bitCast(std.simd.iota(u4, 16));
const interleaved_shuffle_vector = pdep(nibble_indices, vec) | pdep(nibble_indices, ~vec);
return @as(@Vector(16, u4), @bitCast(interleaved_shuffle_vector));
}
00000000000000000000000000000000000000000000000000000000000x = 01010111
01010111
*
uh-we-encountered-a-problem
uh-we-encountered-a-problem
uh-we-encountered-a-problem
uh-we-encountered-a-problem
uh-we-encountered-a-problem
uh-we-encountered-a-problem
uh-we-encountered-a-problem
uh-we-encountered-a-problem
+
uh-we-encountered-a-problem
11111110 11011100 10111010 ........ 10011000 ........ 01110110 ........ 01010100 00110010 00010000
---F---E ---D---C ---B---A ---9---8 ---7---6 ---5---4 ---3---2 ---1---0
11111110 11011100 10111010 10011000 01110110 01010100 ........ 00110010 ........ 00010000 ........ ........ ........
|
01010100 10011000 00110010 01110110 00010000 01010100 00110010 00010000
---5---4 ---9---8 ---3---2 ---7---6 ---1---0 ---5---4 ---3---2 ---1---0