Starting soon!

ACCELERATING THE ZIG PARSER with Niles Salter / @Validark

But first.. a recap of part I

& things to know

  • We discussed how computers can get faster
    • identified fundamental problems that computers must face in the physical universe
    • derived optimization strategies that should be useful across (good) hardware, across time
      • reduce cache misses
      • reduce branch misprediction
      • eliminate unnecessary instructions, fewer is usually better
      • improve data-level parallelism
        • SIMD/SWAR - Single instruction, multiple data & SIMD within a register
      • do work in the pass where it is cheapest to do it

The Zig compiler basics

  • Converts code in text files into executable binaries

Andrew Kelley - Practical DOD
https://vimeo.com/649009599

Intro to
tokenization

// For each of the 16 input bytes, computes its column index within the
// current line: counts restart at 0 after every '\n'. Uses pdep/pext
// (defined elsewhere in this file; NOTE(review): assumed to follow x86
// BMI2 semantics — confirm against their definitions).
export fn columnCounts(chunk: @Vector(16, u8)) @Vector(16, u8) {
    // Bit i set where chunk[i] is a newline.
    const newlines: u16 = @bitCast(@as(@Vector(16, u8), chunk) == @as(@Vector(16, u8), @splat('\n')));
    const ones = 0x1111111111111111;
    // Nibble i holds i: packed 4-bit ascending indices 0..15.
    const ascending_indices = 0xFEDCBA9876543210;
    // Deposit the newline bits at nibble granularity, then smear each across
    // its whole nibble, marking the nibbles where a new line begins.
    const restart_nibbles_mask = pdep(newlines, ones ^ 1) *% 0xF;
    // Gather the ascending index of each line-restart position.
    const restart_nibbles_indices = pext(ascending_indices, restart_nibbles_mask);
    // Per-restart deltas so the subtraction below rebases each line's run.
    const prefix_diff = restart_nibbles_indices -% (restart_nibbles_indices << 4);
    // Subtract each position's most recent restart index from the ascending
    // indices, producing per-byte column counts in packed nibbles.
    const vec: @Vector(8, u8) = @bitCast(ascending_indices -% pdep(prefix_diff, restart_nibbles_mask) *% ones);
    // Unpack low/high nibbles back into 16 one-byte counts.
    return std.simd.interlace(.{ vec, vec >> @splat(4) }) & @as(@Vector(16, u8), @splat(0xF));
}
export
fn
columnCounts
(
chunk
:
@Vector
{
    .kind = .@"export",
    .start = 0,
    .end = 6,
}
{
    .kind = .@"fn",
    .start = 7,
    .end = 9,
}
{
    .kind = .@"identifier",
    .start = 10,
    .end = 22,
}
{
    .kind = .@"(",
    .start = 22,
    .end = 23,
}
{
    .kind = .@"identifier",
    .start = 23,
    .end = 28,
}
{
    .kind = .@":",
    .start = 28,
    .end = 29,
}
{
    .kind = .@"builtin",
    .start = 30,
    .end = 37,
}

A basic tokenizer

/// Tokenizes `source` into a list of Tokens, scanning byte-by-byte.
/// `source` must be NUL-terminated (the `0` prong is the exit).
/// Caller owns the returned list (deinit with the same allocator).
pub fn tokenize(allocator: Allocator, source: []const u8) !std.ArrayListUnmanaged(Token) {
    var cur = source[0..];
    // Heuristic preallocation: assume roughly 8 source bytes per token.
    var tokens = try std.ArrayListUnmanaged(Token).initCapacity(allocator, source.len / 8);

    while (true) {
        switch (cur[0]) {
            // identifier / keyword: [A-Za-z_][A-Za-z0-9_]*
            'a'...'z', 'A'...'Z', '_' => {
                const start = cur;

                while (true) {
                    cur = cur[1..];
                    switch (cur[0]) {
                        'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                        else => break,
                    }
                }

                // our token span is start..end
                const end = cur;

                (try tokens.addOne(allocator)).* = Token{
                    .kind = if (getKeywordKind(start, end)) |kw| kw else .identifier,
                    .start = start,
                    .end = end,
                };
            },
            '0'...'9' => {
                // parse a number (elided on the slide; NOTE(review): as
                // written this prong never advances `cur`)
            },
            // single-character punctuation; NOTE(review): @bitCast of the
            // byte to the kind assumes Token.kind's layout encodes these
            // characters directly — confirm against the Token declaration.
            '~', ':', ';', '[', ']', '?', '(', ')', '{', '}', ',' => |c| {
                const start = cur;
                cur = cur[1..];
                const end = cur;
                (try tokens.addOne(allocator)).* = Token{
                    .kind = @bitCast(c),
                    .start = start,
                    .end = end,
                };
            }, // FIX: switch prongs must be separated by commas
            '"' => while (true) { // parse a string
                cur = cur[1..];
                const is_escaped = cur[0] == '\\';
                cur = cur[@intFromBool(is_escaped)..];
                if (cur[0] == '"' and !is_escaped) break;
            },
            ' ', '\t', '\n', '\r' => cur = cur[1..],
            0 => return tokens,
            // FIX: a switch over u8 must be exhaustive; skip bytes not
            // handled above (advancing avoids an infinite loop).
            else => cur = cur[1..],
        }
    }
}
source: 
export fn columnCounts(chunk: @Vector(16, u8)) @Vector(16, u8) {

A better tokenizer

/// Tokenizes `source`, scanning identifier characters a machine word at a
/// time with SWAR bitstrings instead of byte-by-byte.
/// NOTE(review): the vector load reads up to @sizeOf(V) bytes past `cur` —
/// assumes the buffer carries sentinel padding; confirm with the caller.
pub fn tokenize(allocator: Allocator, source: []const u8) !std.ArrayListUnmanaged(Token) {
    var cur = source[0..];
    // Heuristic preallocation: assume roughly 8 source bytes per token.
    var tokens = try std.ArrayListUnmanaged(Token).initCapacity(allocator, source.len / 8);

    while (true) {
        switch (cur[0]) {
            'a'...'z', 'A'...'Z', '_' => {
                const start = cur;

                while (true) {
                    const V = @Vector(@bitSizeOf(usize), u8);
                    const vec: V = cur[0..@sizeOf(V)].*;

                    // One bit per byte: set where the byte is [A-Za-z0-9_].
                    const identifier_bitstring =
                        (@as(usize, @bitCast(vec == @as(V, @splat('_'))))) |
                        (@as(usize, @bitCast(@as(V, @splat('a')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('z'))))) |
                        (@as(usize, @bitCast(@as(V, @splat('A')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('Z'))))) |
                        (@as(usize, @bitCast(@as(V, @splat('0')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('9')))));

                    // Jump past the run of identifier characters; if the run
                    // ended inside this word, the token is complete.
                    cur = cur[@ctz(~identifier_bitstring)..];
                    if (~identifier_bitstring != 0) break;
                }

                // our token span is start..end
                const end = cur;

                (try tokens.addOne(allocator)).* = Token{
                    .kind = if (getKeywordKind(start, end)) |kw| kw else .identifier,
                    .start = start,
                    .end = end,
                };
            },
            '0'...'9' => {
                // parse a number (elided on the slide; NOTE(review): as
                // written this prong never advances `cur`)
            },
            '~', ':', ';', '[', ']', '?', '(', ')', '{', '}', ',' => |c| {
                const start = cur;
                cur = cur[1..];
                const end = cur;
                (try tokens.addOne(allocator)).* = Token{
                    .kind = @bitCast(c),
                    .start = start,
                    .end = end,
                };
            }, // FIX: switch prongs must be separated by commas
            '"' => while (true) { // parse a string
                cur = cur[1..];
                const is_escaped = cur[0] == '\\';
                cur = cur[@intFromBool(is_escaped)..];
                if (cur[0] == '"' and !is_escaped) break;
            },
            ' ', '\t', '\n', '\r' => cur = cur[1..],
            0 => return tokens,
            // FIX: a switch over u8 must be exhaustive; skip bytes not
            // handled above (advancing avoids an infinite loop).
            else => cur = cur[1..],
        }
    }
}
source: 
export fn columnCounts(chunk: @Vector(16, u8)) @Vector(16, u8) {

Bitstring generation Emit

Basic tokenization loop emit

SWAR emulation

// Produces a 64-bit mask with bit i set where src[i] is [A-Za-z0-9_],
// using SWAR (SIMD within a register): eight u64 words are processed with
// byte-parallel carry tricks, each contributing 8 mask bits.
export fn maskForAlphanumericAndUnderscoresBetterSWAR(src: [*]align(8) const u8) u64 {
    const ones: u64 = 0x0101010101010101;
    // 0x7F in every byte: keeps additions below from carrying across byte
    // boundaries, so bit 7 of each byte acts as a per-byte flag.
    const mask = comptime ones * 0x7F;
    var result: u64 = 0;

    inline for (0..8) |i| {
        const v: u64 = @bitCast(src[i*8 ..][0..8].*);
        const low_7_bits = v & mask;

        // '_' bytes xor to zero; adding 0x7F sets bit 7 only for the
        // nonzero (non-underscore) bytes.
        const non_underscore = (low_7_bits ^ comptime ones * '_') + mask;

        // alpha check:
        // Upper 3 bits must be 4 or 6. Lower 5 bits must be in 1...26
        // Clearing bit 5 folds 'a'-'z' onto 'A'-'Z'; xor 0x40 maps letters to 1..26.
        const magic_mask = ((v & comptime ones * ~@as(u7, 0x20)) ^ comptime ones * 0x40);
        // Bit 7 set unless the byte folded to 0 (i.e. was '@' 0x40 or '`'
        // 0x60, which would otherwise pass the alpha range test below).
        const non_0x40_or_0x60 = magic_mask + mask;
        // Bit 7 set when the folded value is >= 27, i.e. not a letter.
        const non_alpha = magic_mask + comptime ones * (0x80 - 27);

        // digit check:
        // Upper nibble must be 3. Lower nibble must be in 0...9
        const flipped_0x30 = low_7_bits ^ comptime ones * 0x30;
        const non_digit = flipped_0x30 + comptime ones * (0x80 - 0xA);

        // Combine: identifier bytes end with bit 7 set. `~v` rejects bytes
        // whose own high bit was set (non-ASCII); the xor with
        // non_0x40_or_0x60 cancels the '@'/'`' false positives.
        const final = 0x8080808080808080 &
            (~v &
                ((non_digit & non_alpha & non_underscore) ^ non_0x40_or_0x60)
            );

        // Multiply-movemask constant: one bit every 7 positions, so the
        // product gathers the eight bit-7 flags into the top byte.
        // NOTE(review): reading a `comptime var` from runtime code may be
        // rejected by newer Zig compilers — consider freezing into a const.
        comptime var mult: u64 = 0;
        comptime for (0..8) |j|  { mult |= @as(u64, 1) << (7 * j); };

        const movmask = (final *% mult) >> 56;
        result |= movmask << (8 * i);
    }

    return result;
}



&

^

+


*








+

Bitstring generation Emit SWAR

Interleaved Movemask

/// Computes a 64-bit mask with bit i set where src[i] == '_', using four
/// interleaved 16-byte vectors and a shift-merge "interleaved movemask"
/// (a NEON-friendly alternative to x86 PMOVMSKB). Reads exactly 64 bytes.
fn maskForUnderscoresBetterArm(src: [*]const u8) u64 {
    // Gather bytes 0,4,8,... into vec0; 1,5,9,... into vec1; and so on.
    // FIX: @shuffle mask elements must be i32 (the original built a u6
    // vector), and the and/or groupings below were missing a closing
    // parenthesis each.
    const indices = std.simd.iota(i32, 16) * @as(@Vector(16, i32), @splat(4));

    var vec0 = @shuffle(u8, src[0..64].*, undefined, indices + @as(@Vector(16, i32), @splat(0)));
    var vec1 = @shuffle(u8, src[0..64].*, undefined, indices + @as(@Vector(16, i32), @splat(1)));
    var vec2 = @shuffle(u8, src[0..64].*, undefined, indices + @as(@Vector(16, i32), @splat(2)));
    var vec3 = @shuffle(u8, src[0..64].*, undefined, indices + @as(@Vector(16, i32), @splat(3)));

    const false_vec = std.mem.zeroes(@Vector(16, u8));
    const true_vec = ~false_vec;

    // Turn each comparison into 0x00 / 0xFF bytes.
    vec0 = @select(u8, vec0 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
    vec1 = @select(u8, vec1 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
    vec2 = @select(u8, vec2 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
    vec3 = @select(u8, vec3 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);

    // Funnel the four match vectors together one bit, then two bits, then
    // four bits at a time, so each output byte accumulates 8 match bits.
    const t0 = (vec1 & @splat(0b10000000)) | (vec0 >> @splat(1));
    const t1 = (vec3 & @splat(0b10000000)) | (vec2 >> @splat(1));
    const t2 = (t1 & @splat(0b11000000)) | (t0 >> @splat(2));
    const t3 = (t2 & @splat(0b11110000)) | (t2 >> @splat(4));

    // Each u16 lane now holds its 8 useful bits shifted up by 4; shift down
    // and keep the low byte of every lane.
    var t4: @Vector(8, u16) = @bitCast(t3);
    t4 >>= @splat(4);
    const t5: @Vector(16, u8) = @bitCast(t4);
    return @bitCast(@shuffle(u8, t5, undefined, [_]i32{ 0, 2, 4, 6, 8, 10, 12, 14 }));
}
src
{0,4, 8,12,16,20,24,28,32,36,40,44,48,52,56,60}
{0,4, 8,12,16,20,24,28,32,36,40,44,48,52,56,60}
{1,5, 9,13,17,21,25,29,33,37,41,45,49,53,57,61}
{2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62}
{3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63}

Interleaved Movemask Emit

My tokenizer design

  • Observation 1: We can reuse bitstrings!
    • Instead of recomputing the bitstrings every time we look at the beginning of a token, let's just compute the bitstrings once for every
      chunk of 64 characters.
    • This also fits nicely with the fact that we have to do utf8-validation for each chunk.
    • Unfortunately, this means we are going to speculatively produce bitstrings whether we need them or not, for every single chunk!
  • Bitstrings produced speculatively:
    • whitespace: '\n', '\t', ' '
    • identifier/number matcher: [A-Za-z0-9_]
    • control characters: should just be '\n' (used for jumping to the end of comments or single-line strings)
  • Bitstrings produced on-demand:
    • operator-continuation characters: [!%*+./<=>?\\|]
    • unescaped string / char_literal characters: ['"]
      • just too rare in practice to be a meaningful optimization for Zig

Idealized tokenizer code

/// Tokenizes an aligned, sentinel-terminated source buffer into a Token
/// slice. Classification bitstrings for each uword-sized chunk are computed
/// once and reused as the scanner skips runs of token characters with ctz.
/// NOTE(review): the returned slice is over-allocated; only a prefix is
/// populated — confirm how callers learn the actual token count.
pub fn tokenize(gpa: Allocator, source: [:0]align(@bitSizeOf(uword)) const u8) ![]Token {
    const end_ptr = &source.ptr[source.len];
    // Round the scannable region up to whole uword-sized chunks; assumes the
    // buffer carries EXTENDED_BACK_SENTINELS_LEN bytes of padding.
    const extended_source_len = std.mem.alignForward(usize, source.len + EXTENDED_BACK_SENTINELS_LEN, @bitSizeOf(uword));

    const extended_source = source.ptr[0..extended_source_len];
    const tokens = try gpa.alloc(Token, extended_source_len);
    errdefer gpa.free(tokens);

    var cur_token = tokens;
    var prev: []const u8 = extended_source;
    var cur = prev[0..];

    // Per-chunk classification bitstrings, selected by the low two bits of
    // a Tag value. NOTE(review): a u2 index ranges over 0..3 but only 3
    // bitstrings exist — confirm Tag's layout keeps the index < 3.
    var bitstrings: [3]uword = undefined;
    var op_type: Tag = .whitespace;
    // FIX: the original assigned the array element itself to a pointer;
    // take its address instead.
    var selected_bitmap: *uword = &bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
    var bitmap_index = @intFromPtr(source.ptr);
    var utf8_checker: Utf8Checker = .{};

    // Start one chunk "behind" so the first iteration refills immediately.
    bitmap_index -%= @bitSizeOf(uword);

    while (true) : (cur = cur[1..]) {
        var aligned_ptr = @intFromPtr(cur.ptr) / @bitSizeOf(uword) * @bitSizeOf(uword);
        while (true) {
            // Refill bitstrings for every chunk between the last one
            // processed and the chunk `cur` now points into.
            while (bitmap_index != aligned_ptr) {
                bitmap_index +%= @bitSizeOf(uword);
                const V = @Vector(@bitSizeOf(uword), u8);
                const vec: V = @as([*]align(@bitSizeOf(uword)) const u8, @ptrFromInt(bitmap_index))[0..@bitSizeOf(uword)].*;

                try utf8_checker.validateChunk(vec);

                // [0]: non-control bytes (not DEL, and either >= ' ' or a
                // tab) — used to jump to the end of comments/line strings.
                bitstrings[0] = @as(usize, @bitCast(vec != @as(V, @splat('\x7F')))) &
                    (@as(usize, @bitCast(vec >= @as(V, @splat(' ')))) | @as(usize, @bitCast(vec == @as(V, @splat('\t')))));

                // [1]: whitespace. (FIX: stray `uword` token removed.)
                bitstrings[1] =
                    @as(usize, @bitCast(vec == @as(V, @splat('\t')))) |
                    @as(usize, @bitCast(vec == @as(V, @splat('\n')))) |
                    @as(usize, @bitCast(vec == @as(V, @splat(' '))));

                // [2]: identifier/number characters [A-Za-z0-9_].
                // (FIX: stray `: uword` removed.)
                bitstrings[2] = (@as(usize, @bitCast(vec == @as(V, @splat('_'))))) |
                    (@as(usize, @bitCast(@as(V, @splat('a')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('z'))))) |
                    (@as(usize, @bitCast(@as(V, @splat('A')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('Z'))))) |
                    (@as(usize, @bitCast(@as(V, @splat('0')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('9')))));
            }

            // Skip the run of current-token characters inside this chunk.
            const cur_misalignment: std.math.Log2Int(uword) = @truncate(@intFromPtr(cur.ptr));
            const bitstring = selected_bitmap.* >> cur_misalignment;
            cur = cur[@ctz(~bitstring)..];

            // If we made it to the end of chunk(s), wrap around, grab more bits, and start again
            aligned_ptr = @intFromPtr(cur.ptr) / @bitSizeOf(uword) * @bitSizeOf(uword);
            if (bitmap_index == aligned_ptr) break;
        }

        if (op_type == .eof) break;

        while (true) {
            var len: u32 = @intCast(@intFromPtr(cur.ptr) - @intFromPtr(prev.ptr));

            switch (prev[0]) {
                // Keywords: a single trailing space is folded into the token.
                'a'...'z' => if (Keywords.lookup(prev.ptr, len)) |op_kind| {
                    op_type = op_kind;
                    const is_space = @intFromBool(cur[0] == ' ');
                    cur = cur[is_space..];
                    len += is_space;
                },
                else => {},
            }

            cur_token[0] = .{ .len = len, .kind = op_type };
            cur_token = cur_token[1..];
            prev = cur;

            if (cur[0] == '@') {
                cur = cur[1..];
                op_type = switch (cur[0]) {
                    'a'...'z', 'A'...'Z' => .builtin,
                    '"' => .string_identifier,
                    else => return error.MissingQuoteOrLetterAfterAtSymbol,
                };

                // FIX: address-of (see selected_bitmap above).
                selected_bitmap = &bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
            } else if (Operators.isSingleCharOp(cur[0])) {
                selected_bitmap = &bitstrings[@as(u2, @truncate(@intFromEnum(Tag.whitespace)))];
                op_type = Operators.hashOp(Operators.getOpWord(cur.ptr, 1));
            } else {
                op_type = switch (cur[0]) {
                    'a'...'z', 'A'...'Z', '_' => .identifier,
                    '0'...'9' => .number,
                    '\'' => .char_literal,
                    '"' => .string,
                    ' ', '\t', '\n' => .whitespace,
                    0 => .eof,
                    else => .unknown,
                };

                selected_bitmap = &bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
            }

            if (cur[0] != '\'' and cur[0] != '"') break;

            // FIX: remember which quote character opened the literal (the
            // original referenced an undeclared `chr`).
            const chr = cur[0];
            // Scan to the matching unescaped quote; any control byte (< ' ',
            // e.g. a raw newline or NUL) also terminates the scan.
            while (true) {
                cur = cur[1..];
                const is_escaped = cur[0] == '\\';
                cur = cur[@intFromBool(is_escaped)..];
                if ((cur[0] == chr and !is_escaped) or cur[0] < ' ') break;
            }
        }
    }

    // A NUL before the true end of the buffer means an embedded 0 byte.
    if (@intFromPtr(cur.ptr) < @intFromPtr(end_ptr)) return error.Found0ByteInFile;
    cur_token[0] = .{ .len = 1, .kind = .eof };
    return tokens;
}

How do we emulate
count-trailing-zeroes?

How do we emulate
count-trailing-zeroes?

/// Counts trailing zero bits of `a` (returns 64 for a == 0) without a
/// dedicated instruction, using the classic de Bruijn trick: isolate the
/// lowest set bit, multiply by a de Bruijn sequence so the top 6 bits of
/// the product form a unique index, and look the position up in a table.
fn ctz(a: u64) u8 {
    if (a == 0) return 64;
    const x = a & (~a +% 1); // isolate lowest set bit: a & -a
    const deBruijn: u64 = 0b0000001000011000101000111001001011001101001111010101110110111111;
    // Build the 64-entry position table at compile time. (FIX: the original
    // read a `comptime var` from runtime code, and shifted an untruncated
    // comptime_int, which breaks the 64-bit wraparound the trick relies on;
    // freeze the table into a const and shift as u64 instead.)
    const lookup_table = comptime blk: {
        var table: [64]u8 = undefined;
        for (0..64) |i| table[(deBruijn << @intCast(i)) >> 58] = i;
        break :blk table;
    };
    return lookup_table[(deBruijn *% x) >> 58];
}

llvm/llvm-project#84763

How do we emulate
count-trailing-ones?

How do we emulate
count-trailing-ones?

/// Counts trailing one bits of `a` (returns 64 when all bits are set) by
/// applying the de Bruijn count-trailing-zeros trick to ~a.
fn cto(a: u64) u8 {
    // FIX: `a == -1` does not compile for an unsigned operand; test for
    // the all-ones value via complement instead.
    if (~a == 0) return 64;
    const not_a = ~a;
    const x = not_a & (~not_a +% 1); // isolate lowest cleared bit of a: ~a & -~a
    const deBruijn: u64 = 0b0000001000011000101000111001001011001101001111010101110110111111;
    // Build the 64-entry position table at compile time. (FIX: same
    // `comptime var`/untruncated-shift repairs as in ctz.)
    const lookup_table = comptime blk: {
        var table: [64]u8 = undefined;
        for (0..64) |i| table[(deBruijn << @intCast(i)) >> 58] = i;
        break :blk table;
    };
    return lookup_table[(deBruijn *% x) >> 58];
}

How do we emulate
count-trailing-ones?

define dso_local noundef i64 @_Z3ct1m(i64 noundef %0) local_unnamed_addr #0 {
    %2 = xor i64 %0, -1
    %3 = tail call noundef i64 @llvm.cttz.i64(i64 %2, i1 false), !range !9
    ret i64 %3
}

llvm/llvm-project#82487

Count-trailing-ones on risc-v

Count-trailing-ones on sparc

Count-trailing-ones cross-platform

Optimization: Use Vector shuffles to
match whitespace

Mask Whitespace with shuffle emit

Keyword Lookups

/// Checks whether the `len`-byte string at `kw` is a Zig keyword and, if
/// so, returns its Tag; otherwise returns null. Hashes the candidate to a
/// table slot, then compares against the padded keyword stored there with
/// a single 16-byte vector compare. Caller must guarantee 16 readable
/// bytes at `kw`.
pub fn lookup(kw: [*]const u8, len: u32) ?Tag {
    assert(len != 0);

    const raw_hash = hashKw(kw, len);
    const hash = mapToIndex(raw_hash);

    // Table padding bytes store the keyword's length, which is < '0' and
    // therefore can never collide with identifier characters.
    comptime assert(max_kw_len < '0' and '0' > 16 - 1);

    const KW_VEC = @Vector(16, u8);
    const vec1: KW_VEC = sorted_padded_kws[hash];
    const vec2: KW_VEC = kw[0..16].*; // the safety of this operation is validated at the top of this function
    const len_vec: KW_VEC = @splat(@as(u8, @truncate(len)));
    // Lanes at positions >= len are replaced by the length byte, mirroring
    // the table's padding scheme so true matches agree in all 16 lanes.
    const cd = @select(u8, len_vec > std.simd.iota(u8, 16), vec2, len_vec);

    if (std.simd.countTrues(cd != vec1) == 0) {
        // NOTE(review): mapping the table index to a Tag via `~hash`
        // presumes the keyword Tag values are laid out to match — confirm
        // against the Tag declaration.
        return @enumFromInt(~hash);
    } else {
        return null;
    }
}

/// Hashes a keyword candidate into 7 bits by mixing its first two bytes,
/// its last two bytes, and its length.
/// NOTE(review): `kw - 2 + len` reads the final two bytes, so this assumes
/// len >= 2 (the shortest Zig keyword is 2 characters) — confirm callers
/// never pass len == 1.
pub fn hashKw(kw: [*]const u8, len: u32) u7 {
    const a = std.mem.readInt(u16, kw[0..2], .little);
    const b = std.mem.readInt(u16, (kw - 2 + @as(usize, @intCast(len)))[0..2], .little);
    return @truncate(((a ^ (len << 14)) *% b) >> 8);
}

/// Maps a raw 7-bit hash to a dense index into `sorted_padded_kws` by
/// counting how many occupied hash slots (set bits in `masks`) precede it.
fn mapToIndex(raw_hash: u7) u8 {
    // Bits strictly below (raw_hash mod 64) within one mask word; the
    // truncating shift wraps hashes 64..127 into the second word's range.
    const mask = (@as(u64, 1) << @truncate(raw_hash)) -% 1;
    // Every slot in masks[0] precedes any slot counted in masks[1].
    return (if (raw_hash >= 64) @popCount(masks[0]) else 0) + @popCount(mask & masks[raw_hash / 64]);
}

// Bitmap of which raw 7-bit keyword-hash values are occupied: masks[0]
// covers hashes 0-63, masks[1] covers 64-127. mapToIndex uses popcounts
// over these to compress the sparse hash space into dense table indices.
const masks = [2]u64{
    0b0000001000000110001011010011101000001000010010001011011000100100,
    0b0001011010100000000001100000111101100010110011111001100110011100,
};

// Keyword table indexed by mapToIndex(hashKw(...)). Each entry is exactly
// 16 bytes: the keyword followed by padding bytes equal to the keyword's
// length, mirroring the padded candidate that `lookup` builds with @select.
// The final all-255 entry is a sentinel that can never match.
const sorted_padded_kws = [50][16]u8{
    ("extern"         ++ "\x06" ** 10).*,
    ("while"          ++ "\x05" ** 11).*,
    ("align"          ++ "\x05" ** 11).*,
    ("else"           ++ "\x04" ** 12).*,
    ("or"             ++ "\x02" ** 14).*,
    ("break"          ++ "\x05" ** 11).*,
    ("anytype"        ++ "\x07" **  9).*,
    ("volatile"       ++ "\x08" **  8).*,
    ("async"          ++ "\x05" ** 11).*,
    ("errdefer"       ++ "\x08" **  8).*,
    ("catch"          ++ "\x05" ** 11).*,
    ("test"           ++ "\x04" ** 12).*,
    ("struct"         ++ "\x06" ** 10).*,
    ("const"          ++ "\x05" ** 11).*,
    ("return"         ++ "\x06" ** 10).*,
    ("await"          ++ "\x05" ** 11).*,
    ("resume"         ++ "\x06" ** 10).*,
    ("opaque"         ++ "\x06" ** 10).*,
    ("packed"         ++ "\x06" ** 10).*,
    ("orelse"         ++ "\x06" ** 10).*,
    ("var"            ++ "\x03" ** 13).*,
    ("unreachable"    ++ "\x0b" **  5).*,
    ("threadlocal"    ++ "\x0b" **  5).*,
    ("anyframe"       ++ "\x08" **  8).*,
    ("noinline"       ++ "\x08" **  8).*,
    ("defer"          ++ "\x05" ** 11).*,
    ("try"            ++ "\x03" ** 13).*,
    ("pub"            ++ "\x03" ** 13).*,
    ("usingnamespace" ++ "\x0e" **  2).*,
    ("fn"             ++ "\x02" ** 14).*,
    ("and"            ++ "\x03" ** 13).*,
    ("allowzero"      ++ "\x09" **  7).*,
    ("error"          ++ "\x05" ** 11).*,
    ("addrspace"      ++ "\x09" **  7).*,
    ("if"             ++ "\x02" ** 14).*,
    ("nosuspend"      ++ "\x09" **  7).*,
    ("linksection"    ++ "\x0b" **  5).*,
    ("inline"         ++ "\x06" ** 10).*,
    ("export"         ++ "\x06" ** 10).*,
    ("asm"            ++ "\x03" ** 13).*,
    ("noalias"        ++ "\x07" **  9).*,
    ("suspend"        ++ "\x07" **  9).*,
    ("switch"         ++ "\x06" ** 10).*,
    ("union"          ++ "\x05" ** 11).*,
    ("enum"           ++ "\x04" ** 12).*,
    ("continue"       ++ "\x08" **  8).*,
    ("for"            ++ "\x03" ** 13).*,
    ("callconv"       ++ "\x08" **  8).*,
    ("comptime"       ++ "\x08" **  8).*,
    .{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
};
kw = "export", len = 6
hash = 38
raw_hash = 96
raw_hash = 96 & 63 => 32
const mask = (1 << 32) -% 1

        

        

        

        
add that 17 to @popCount(masks[0]), known at compile-time to be 21

Lookup emit

RISC-V SWAR emit

Passive production of
newline bitstrings

    • I noticed the following code in Zig's AstGen.zig:
/// Advances the AstGen source cursor from its current offset to `end`,
/// updating the tracked line and column as newlines are crossed.
fn advanceSourceCursor(astgen: *AstGen, end: usize) void {
    const source = astgen.tree.source;
    var offset = astgen.source_offset;
    var line_no = astgen.source_line;
    var col_no = astgen.source_column;
    assert(offset <= end);
    while (offset < end) : (offset += 1) {
        if (source[offset] != '\n') {
            col_no += 1;
        } else {
            // Newline: next line begins at column zero.
            line_no += 1;
            col_no = 0;
        }
    }
    astgen.source_offset = offset;
    astgen.source_line = line_no;
    astgen.source_column = col_no;
}

Want to learn more
about the tokenizer?

    • Our SIMD on-demand unescaped-quote matcher was designed by the creators of simdjson.
    • The utf8-validator was ported from simdjson / simdutf by travisstaloch and myself.
      • I wrote equivalent SWAR routines so machines without proper vectors can exploit data-level parallelism too!

Daniel Lemire - Parsing JSON Really Quickly: Lessons Learned
https://www.youtube.com/watch?v=wlvKAT7SZIQ

Zig Compiler Internals II

Andrew Kelley - Practical DOD
https://vimeo.com/649009599

I. Operator Precedence Encoder

Part II: The Parser

  • Converts tokens into a tree-like representation called the Abstract Syntax Tree
    • This is the stage where we validate that the source code fits the "language grammar"
    • We also figure out the order in which operations are supposed to be performed.
      • We need to deal with every operator in the language, including `try`, `catch`, the comparison operators, `.*`, `&`, etc.
      • We also disambiguate between the multiple ways an operator can be interpreted.
        • For instance, `-` can be a binary subtraction or a unary negative based on context.

AST Demo

Let’s Build A Simple Interpreter. Part 7: Abstract Syntax Trees
https://ruslanspivak.com/lsbasi-part7/

Pre-order vs
in-order vs
post-order in code

// Illustrative pseudocode: marks where pre-order, in-order, and post-order
// processing slot into a recursive walk of a binary tree (left subtree,
// then right subtree). NOTE(review): not compilable Zig as written — the
// optional pointer is never unwrapped and the return type is elided.
fn parse(node: ?*Node) {
    if (node == null) return;

    // Operate on the Nodes in pre-order
    parse(node.left);
    // Operate on the Nodes in in-order
    parse(node.right);
    // Operate on the Nodes in post-order
}

Zig's current Parser

Look it up yourself :)

Chandler Carruth's strategy

Shunting yard

More memory efficient strategies

  • We make the Tokenizer emit 2 bytes per token. 1 byte for a kind enum, 1 byte for a length
    • If the length is >255, we store a 0 as the length, and let the next 4 bytes be the true length.
    • This makes the worst case memory consumption scenario a lot better. Now our 1 byte tokens will be mapped to 2 bytes, and our 256+ byte tokens will be mapped to 6 bytes.
    • Drawbacks:
      • All bytes in our source file have to map to a token, even whitespace and comments.
        • Luckily, for most of our tokens, the length is irrelevant to later passes. We only care about the length of identifiers, strings, numbers, and data that we actually need to look at later.
          A `return` is a `return`, and we do not need any further information beyond that.
      • We have to visit all tokens so that we can properly add their length to our cursor.
        • We don't really want to skip over anyone's hard-fought, artisan-crafted code anyway
    • Benefits:
      • We can dramatically reduce memory usage!
      • We can dramatically reduce memory usage!
      • Did I mention we can dramatically reduce memory usage, like by a lot?!

Succinct vs implicit

    • We want to store our trees without pointers to avoid O(n log n) bits being used.
    • There are two classes of data structures which allow us to escape that regime, succinct and implicit.

    Simon Gog - Blog
    simongog.github.io

Choosing implicit

    • Disambiguation makes it so all tokens have a known arity, and are known to be a prefix/infix/suffix operator
    • Therefore, an implicit representation can be modeled as a simple re-ordering of the tokens that gives us sufficient information to check syntactic correctness and get the correct order of operations out.

Analysis of direct postfix consumption

  • Inherently, using an algorithm that rearranges data solely by pushing to and popping from a stack, we can only move data to occur later than it did originally.
    • Prefix operators can be moved to in-order, Infix operators can be moved to post-order, and Postfix operators are going to stay in post-order.
    • In order to use the `length` trick I want, we cannot use a Post-ordering of the AST/Parse Tree, because we have to know the pre-order, in-order, and post-order to pull it off.

My Design

  • We model all tokens as either an operator or an operand, and shove it into a specialized version of the Shunting Yard algorithm.
  • Sometimes, there are some oddities in well-formed Zig code.
    • E.g. `break: blk thing` has two identifiers next to each other, so the whitespace in between them will get its own token.
  • Example transformation:
    • -(-b * -c - -d.*.?) / (e + f) * g + h - -i.*.? * j * (k * (l - m) + n / o);
    • ; - + * / unary - ( - * unary - b unary - c unary - .? .* d ) ( + e f ) g h * * unary - .? .* i j ( + * k ( - l m ) / n o )
  • Everyone pays the cost of producing in post-order and then iterating over it in pre-order, we want to reduce that cost.
  • Observation:
    • Operands never change order with respect to each other. Operators are the ones we are shuffling around.
    • We can omit operands from the algorithm and decrease the amount of data our algorithm has to deal with by roughly a factor of 2.

The OpString

// The OpString: a buffer of pending operator tokens plus the bitstring
// describing how they interleave during a merge (slide fragment — the
// surrounding declaration is elided).
struct {
    // Pending operator tokens; only the first operators_len are valid.
    operators: [BUF_SIZE]Token,
    // Interleave pattern for merging. NOTE(review): presumably each bit
    // selects which side's next operator is emitted — confirm with the
    // merge routine.
    merge_bitstring: u64,
    // NOTE(review): likely a link/index to the next OpString — confirm.
    next: u32,
    // Number of valid entries in `operators`.
    operators_len: u8,
    // Number of valid bits in `merge_bitstring`.
    bitstr_len: u8,
}
// Example:
// .operators = .{ +, *, -, /, +, undefined, undefined, undefined }
// .merge_bitstring = 1010101010;

Merging OpStrings

  • Given a bitstring and two buffers, how do we combine them efficiently?
  • struct {
        operators: [BUF_SIZE]Token,
        merge_bitstring: u64,
        next: u32,
        operators_len: u8,
        bitstr_len: u8,
    }
    // Example:
    // .operators = .{ +, *, -, /, +, undefined, undefined, undefined }
    // .merge_bitstring = 1010101010;
    // .operands = .{  a, b, c, d, e, f, g, h, i ... }
    
    // Output: a + b * c - d / e + f ...
    

    There's an instruction for that!

    Can we emulate VPEXPANDW?

    • Use vector shuffle/permute/swizzle instructions!

    Expansion & Compaction

    Where do we have vector
    shuffle instructions?

    • x86-64:
      • SSSE3 (2006+) => PSHUFB (16 byte-lookups) (Steam hardware survey found 99.73% support)
      • XOP (AMD 2011-2017, Intel never) => VPPERM (effectively 32 byte-lookups due to 2 source
        vectors)
      • AVX2 (2011+) => VPSHUFB (16 byte-lookups, optionally on both halves of a 32-byte vector
        simultaneously but cross-lane shuffling is disallowed)
      • AVX-512 (Intel 2016+/2018+, AMD Zen 4 2022+) => VPSHUF/VPERM (all your wildest dreams)
        • for (V)PSHUF{B,W,D,Q}, if bit 7 is set, the corresponding output will be 0, otherwise only the lower 4 bits are used for the lookup
    • arm64 / aarch64:
      • NEON => TBL & TBX (at least 16 byte-lookups, but can take up to 4 source vectors, so effectively we can
        do 16 lookups in a 64 byte table)
        • if out of range, the corresponding output will be zeroed (TBL) or the original value preserved (TBX)
        • arm32 has the same instructions, but the vector sizes are 8 bytes instead of 16.
    • PowerPC:
      • VSX => XXPERMX
      • AltiVec => VPERM (32 byte-lookups)
    • RISC-V:
      • V => VRGATHER (at least 16 byte-lookups)
        • My single-board RISC-V machine, the Star 64 (Sifive U74 CPU) does not have vectors,
          and therefore no vector shuffles.
    • MIPS:
      • MSA => VSHF (16 byte-lookups)
    • SPARC:
      • VIS 2.0 => BSHUFFLE (16 byte-lookups), but you can only keep the first 8 indices
    • WASM:
      • SWIZZLE (16 byte-lookups)

    How do we produce the
    shuffle vector?

    • Generic:
      • Could use a lookup table
        • Problem: lookup tables for expand/compress instructions are often prohibitively large
        • For 32 byte shuffles, mapping 16 bits => 32 bytes takes 2MiB
          • Can be worth it in tight loops, but adds bloat to the stack and the executable file
        • For 16 byte shuffles, mapping 8 bits => 16 bytes takes 4KiB
          • Could reduce @Vector(16, u8) => @Vector(16, u4), cutting size to 2KiB
            at the cost of extra instructions to decode it
    • PowerPC:
      • Power10 => XXGENPCVHM
        • Generate-Permute-Control-Vector-Halfword-according-to-Mask can produce
          a vector for expansion or compaction
    • RISC-V:
      • V => VIOTA
        • Predicated prefix-sum instruction
        • This strategy is applicable across hardware!
    /// Builds a 16-nibble shuffle-control vector from the 8-bit mask `x`,
    /// for emulating expand/compress (VPEXPANDW-style) via a byte shuffle:
    /// lanes whose mask bit is set receive ascending indices within the
    /// "set" group, the rest within the "cleared" group.
    /// NOTE(review): returns @Vector(16, u4) against a declared
    /// @Vector(16, u8) — confirm this elementwise widening coercion on the
    /// targeted compiler version.
    fn produceShuffleVectorForByte(x: u8) @Vector(16, u8) {
        // Copy the byte x into all 8 byte lanes of a u64.
        const splatted = @as(u64, x) * 0x0101010101010101;
        // Byte lane i holds 1 << i, so lane i can test bit i of x.
        const bit_positions: u64 = @bitCast(@as(@Vector(8, u8), @splat(1)) << std.simd.iota(u3, 8));
        // Make lane i's MSB equal bit i of x...
        const splatted_msbs = (splatted & bit_positions) + (0x8080808080808080 - bit_positions);
        // ...then shift it down so lane i holds 0 or 1.
        const splatted_lsbs = (splatted_msbs >> 7) & 0x0101010101010101;
        // Nibble-wise exclusive prefix sums: how many set bits precede each lane.
        const prefix_sums1 = ((splatted_lsbs << 4) | splatted_lsbs) *% 0x1111111111111110;
        const splatted_lsbs_inv = splatted_lsbs ^ 0x0101010101010101;
        // Same prefix sums, counting cleared bits instead.
        const prefix_sums2 = ((splatted_lsbs_inv << 4) | splatted_lsbs_inv) *% 0x1111111111111110;
        // 0xFF in set lanes, 0x00 in cleared lanes.
        const selector = splatted_lsbs * 255;
        // Per lane: choose prefix_sums1 where the mask bit is set, else prefix_sums2.
        const interleaved_shuffle_vector = (selector & (prefix_sums1 ^ prefix_sums2)) ^ prefix_sums2;
        return @as(@Vector(16, u4), @bitCast(interleaved_shuffle_vector));
    }
    [0]00000000000000000000000000000000000000000000000000000000x = 01010111
    00000001 00000001 00000001 00000001 00000001 00000001 00000001 00000001
    *
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    +
    
    
    
    1....... .1...... ..1..... ...1.... ....1... .....1.. ......1. .......1
    &
    
    
    
    0....... 01...... 011..... 0111.... 01111... 011111.. 0111111. 01111111
    
    
    
    +
    
    
    
    
    |
    
    
    
    
    
    
    *
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    +
    splatted_lsbs = 
         selector = 
     prefix_sums1 = 
     prefix_sums2 = 
     (       1 & (prefix_sums1 ^ prefix_sums2)) ^ prefix_sums2
     (       0 & (prefix_sums1 ^ prefix_sums2)) ^ prefix_sums2
    ≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈
    00000001 00000001 00000001 00000001 00000001 00000001 00000001 00000001
    ^
    
    
    
    
    |
    
    
    
    
    *
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    +
    
    
    
    
    
    
    *
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    +
    
    
    
    
    
    
    
    

    More VPEXPANDW emulation?

    • x86-64:
      • AVX-512F (first mainstream support was Intel Cannon Lake 2018+) => VPEXPAND{D, Q}
        • Intel machines before Ice Lake (2019) lack AVX512_VBMI2, which introduced VPEXPAND{B, W}
        • @Vector(N, u16) => @Vector(N, u32), do our operation, then @Vector(N, u32) => @Vector(N, u16)
    • BESM Soviet mainframes:
      • BESM-6 (Designed in 1965. Produced 1968-1987) => AUX & APX
    • x86-64:
      • BMI2 (Intel Haswell 2013+, AMD Excavator 2015+, Fast on AMD since Zen 3 2020+) => PDEP & PEXT
    • arm64 / aarch64:
    • PowerPC:
    • RISC-V:
      • Zbe => BDEP & BEXT, but did not make it into ratified B instruction set
    /// Same shuffle-control construction as the SWAR version, but using
    /// pdep (defined elsewhere in this file; NOTE(review): assumed to
    /// follow x86 BMI2 bit-deposit semantics) to scatter bits directly.
    fn produceShuffleVectorForByte(x: u8) @Vector(16, u8) {
        // Scatter the 8 bits of x to the LSB of each byte lane.
        const splatted = pdep(x, 0x0101010101010101);
        // 0xFF in set lanes, 0x00 in cleared lanes.
        const vec = splatted * 255;
        // The 16 nibble values 0..15 in ascending order.
        const nibble_indices: u64 = @bitCast(std.simd.iota(u4, 16));
        // Deposit ascending index bits into the set lanes, and independently
        // into the cleared lanes, then combine.
        const interleaved_shuffle_vector = pdep(nibble_indices, vec) | pdep(nibble_indices, ~vec);
        return @as(@Vector(16, u4), @bitCast(interleaved_shuffle_vector));
    }
    00000000000000000000000000000000000000000000000000000000000x = 01010111
    01010111
    
    
    
    
    *
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    +
    uh-we-encountered-a-problem
    11111110 11011100 10111010 ........ 10011000 ........ 01110110 ........ 01010100 00110010 00010000
    ---F---E ---D---C ---B---A ---9---8 ---7---6 ---5---4 ---3---2 ---1---0
    11111110 11011100 10111010 10011000 01110110 01010100 ........ 00110010 ........ 00010000 ........ ........ ........
    |
    01010100 10011000 00110010 01110110 00010000 01010100 00110010 00010000
    ---5---4 ---9---8 ---3---2 ---7---6 ---1---0 ---5---4 ---3---2 ---1---0

    Conclusion

      • We discussed ways to optimize modern hardware across ISA's, and what compilers could do in the future to help.
      • We decreased the memory usage and thereby increased the density of the data with regards to the second pass of the compiler.
      • We made it feasible to combine the next two passes of the compiler, and have it do all the validation we need.
        • (aside from ensuring that the transformation of the second pass is actually correct)
      • We hopefully got somebody interested in this stuff!

    Fin